# Implementation of sentiment analysis using Word Vectors, Tfidf weights and K-means Clustering algorithm

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import re
from re import sub
import string
import multiprocessing

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

from sklearn.cluster import KMeans


from time import time 
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('english')

from gensim.parsing.preprocessing import remove_stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Loading of dataset
# df1 keeps the sheet exactly as read; df is the cleaned working copy.
df1 = pd.read_excel('/content/drive/My Drive/data/covid_sentiment.xlsx')

# 'Unnamed: 0' / 'Unnamed: 0.1' look like row-index columns left behind by earlier
# to_excel round-trips (presumably - confirm). errors='ignore' keeps this cell
# runnable on a sheet that does not carry them instead of raising KeyError.
df = (
    df1.dropna()
       .drop_duplicates()
       .reset_index(drop=True)
       .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
)

In [None]:
# Visualizing dataset
df.head()

Unnamed: 0,Date-Time,Title,Content,Article URL,All_Content,Summary,sentiment_prediction,Actual_Sentiment
0,"21 Apr, 2020, 01:52PM IST",Covid-19: Karnataka to collect samples of jour...,The directions followed a request from Educati...,https://economictimes.indiatimes.com//news/pol...,After 53 journalists in Mumbai were found posi...,After 53 journalists in Mumbai were found posi...,0,0
1,"20 Apr, 2020, 07:33AM IST",Newborn tests positive for COVID-19 in Rajasth...,"Dr Shadab Ali, in-charge of Basni primary heal...",https://economictimes.indiatimes.com//news/pol...,Jaipur: A newborn baby has tested positive for...,Jaipur: A newborn baby has tested positive for...,0,0
2,"18 Apr, 2020, 12:17PM IST",Police officer dies of COVID-19 in Ludhiana,The 52-year-old Ludhiana assistant commissione...,https://economictimes.indiatimes.com//news/pol...,The 52-year-old Ludhiana assistant commissione...,"""Sad to share that we had lost Gurmail Singh K...",0,0
3,"10 Apr, 2020, 02:14AM IST",The Covid curve: How the states fare,Data suggests that some of the 15 states/UTs n...,https://economictimes.indiatimes.com//news/pol...,Fifteen Indian states and Union Territories ac...,Fifteen Indian states and Union Territories ac...,0,0
4,"15 Apr, 2020, 06:33PM IST",Covid fight needs women to be agents of change,"Women civil servants and police at Centre, sta...",https://economictimes.indiatimes.com//news/pol...,"By LAKSHMI PURI In our Covid wars, it’s time t...","Pending mass vaccination, Nagarik Dharma Yuddh...",1,1


In [None]:
# Data Pre-processing


#df['Summary'] = df['Summary'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop)]))


def text_to_word_list(text, stopwords=None):
    '''
    Normalise a raw summary and return its tokens with stopwords removed.

    Steps, in order:
      1. cast to str and lower-case;
      2. replace every character outside ``A-Za-z0-9^,!?./'+`` with a space;
      3. turn commas and periods into spaces, pad ``!`` and ``?`` with spaces so
         they become separate tokens, delete apostrophes;
      4. swap the substrings "positive" and "negative";
      5. split on whitespace and drop tokens found in ``stopwords``.

    text      - any object; it is converted with str().
    stopwords - optional iterable of lower-case tokens to drop. Defaults to the
                notebook-level ``stop`` list.

    Returns a list of str tokens (possibly empty).
    '''
    if stopwords is None:
        stopwords = stop
    # Set membership is O(1) per token; the original scanned the list for every token.
    stopwords = frozenset(stopwords)

    text = str(text).lower()

    # Clean the text. Double quotes and colons are already turned into spaces by
    # this character filter, so they need no separate pass.
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"[,.]", " ", text)
    text = sub(r"([!?])", r" \1 ", text)
    text = sub(r"'", "", text)

    # Swap "positive" <-> "negative" in a single pass. The earlier three-step swap
    # went through the placeholder "sh0", which also rewrote any genuine "sh0"
    # in the text to "negative".
    # NOTE(review): presumably the swap exists because "tested positive" is bad
    # news in COVID reporting - confirm the intent.
    swap = {"positive": "negative", "negative": "positive"}
    text = sub(r"positive|negative", lambda match: swap[match.group(0)], text)

    # str.split() already collapses runs of whitespace.
    return [token for token in text.split() if token not in stopwords]


df['Summary'] = df['Summary'].apply(lambda x: text_to_word_list(x))



In [None]:
df['Summary']

0     [53, journalists, mumbai, found, negative, cov...
1     [jaipur, newborn, baby, tested, negative, coro...
2     [sad, share, lost, gurmail, singh, kanungo, ye...
3     [fifteen, indian, states, union, territories, ...
4     [pending, mass, vaccination, nagarik, dharma, ...
5     [bhubaneswar, banamali, sethi, drove, ambulanc...
6     [earlier, week, us, officials, said, number, a...
7     [doctor, geriatric, department, aiims, urged, ...
8     [sellers, market, government, india, entrepris...
9     [home, quarantine, strict, 12, 470, home, quar...
10    [stating, today, official, spokesman, said, ha...
11    [even, religion, continues, lay, claim, core, ...
12    [march, 26, day, national, lockdown, announced...
13    [chennai, vice, president, venkaiah, naidu, we...
14    [empowered, group, also, got, 30, molecular, t...
15    [new, delhi, centre, preparing, procure, thous...
16    [gujarat, state, government, offices, 4, 000, ...
17    [seven, states, doubling, rates, 20, 30, d

In [None]:
# Creating bigrams of phrases
# Each element of `sent` is the token list of one summary.
sent = df['Summary'].tolist()

# Learn collocations over all summaries, then wrap them in the lighter Phraser.
phrases = Phrases(sent, min_count=1, progress_per=5000)
bigram = Phraser(phrases)

# Apply the phraser to the whole corpus and show the second summary as a sample.
sentences = bigram[sent]
sentences[1]

['jaipur',
 'newborn',
 'baby',
 'tested_negative',
 'coronavirus',
 'rajasthans',
 'nagaur',
 'district',
 'official_said',
 'monday',
 'baby',
 'born',
 'saturday',
 'coronavirus_negative',
 'dr',
 'shadab',
 'ali',
 'charge',
 'basni',
 'primary',
 'health',
 'centre',
 'pregnant',
 'woman',
 'admitted',
 'informed',
 'family',
 'newborn',
 'coronavirus_negative',
 'test',
 'report',
 'baby',
 'came',
 'sunday',
 'mother',
 'father',
 'family_members',
 'covid_19',
 'patients',
 'nagaur',
 'chief_medical',
 'health_officer',
 'dr',
 'sukumar',
 'kashyap',
 'said']

## Word2Vec

In [None]:
# Implementation of word2vec algorithm

# Leave one core free for the notebook, but never request fewer than one worker:
# cpu_count() - 1 evaluates to 0 on a single-core runtime, which would leave
# training without any worker thread.
n_workers = max(1, multiprocessing.cpu_count() - 1)

# NOTE(review): `size` is the gensim 3.x name of the vector dimensionality
# argument (gensim 4 calls it `vector_size`) - keep in sync with the installed version.
w2v_model = Word2Vec(min_count=3,
                     window=10,
                     size=200,
                     sample=1e-5,
                     alpha=0.04,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_workers)

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

Time to build vocab: 0.0 mins


In [None]:
# Training the word2vec model
start = time()

w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_minutes = round((time() - start) / 60, 2)
print(f'Time to train the model: {elapsed_minutes} mins')

# Precompute normalised vectors; replace=True asks gensim to keep only those
# (see the gensim 3.x init_sims documentation).
w2v_model.init_sims(replace=True)

Time to train the model: 0.01 mins


In [None]:
# Saving the word2vec model

w2v_model.save("word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Re-joining the tokens (with detected bigrams merged) into a single string per summary
df['Summary'] = df['Summary'].apply(lambda x: ' '.join(bigram[x]))

In [None]:
df['Summary'].head()

0    53 journalists mumbai found_negative covid_19 ...
1    jaipur newborn baby tested_negative coronaviru...
2    sad share lost gurmail singh kanungo yesterday...
3    fifteen indian states union territories accoun...
4    pending mass vaccination nagarik dharma yuddha...
Name: Summary, dtype: object

In [None]:
# Loading the word vectors
# Load from the same relative path the model was saved to ("word2vec.model").
# The absolute "/content/..." path only resolved when the working directory
# happened to be /content.
word_vectors = Word2Vec.load("word2vec.model").wv

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## K-Means clustering algorithm

In [None]:
# K-Means clustering model

model = KMeans(n_clusters=2, max_iter=1000, n_init=10, random_state=75).fit(X=word_vectors.vectors)

In [None]:
# Checking what word vectors are most similar in terms of cosine similarity to coordinates of first cluster

word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

  if np.issubdtype(vec.dtype, np.int):


[('kerala', 0.6360455751419067),
 ('4', 0.6050294637680054),
 ('covid_19', 0.5938774347305298),
 ('officials_said', 0.5875084400177002),
 ('days', 0.5684670209884644),
 ('report', 0.5435541868209839),
 ('people', 0.5435150861740112),
 ('one', 0.5430293083190918),
 ('day', 0.5383172035217285),
 ('six_inter', 0.5344622135162354)]

In [None]:
# Assigning cluster centers
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [None]:
# Assigning clusters
# One row per vocabulary word: the word, its embedding, and the K-Means cluster
# the embedding falls into.

words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']

# `word_vectors` already is the KeyedVectors object, so index it directly; going
# through its deprecated `.wv` attribute emits a warning on gensim 3.x and the
# attribute no longer exists on gensim 4.
words['vectors'] = words['words'].apply(lambda word: word_vectors[word])

# Predict all words in one call instead of one model.predict() per row.
words['cluster'] = model.predict(np.vstack(words['vectors'].tolist()))

  """


In [None]:
# Polarity sign per word: +1 for cluster 0, -1 for every other cluster.
# NOTE(review): nothing in this notebook verifies that cluster 0 is the positive
# cluster - confirm by inspecting the words nearest to each centroid.
words['cluster_value'] = np.where(words['cluster'] == 0, 1, -1)

# Closeness = inverse of the distance to the nearest centroid
# (model.transform returns the distance to every centroid).
words['closeness_score'] = words['vectors'].apply(
    lambda vector: 1 / (model.transform([vector]).min())
)

# Signed weight that ends up in the sentiment dictionary.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [None]:
words.head(10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,journalists,"[0.015491067, 0.080296226, -0.112258576, 0.051...",0,1,1.034974,1.034974
1,mumbai,"[-0.060328998, 0.032514807, 0.082572155, 0.003...",1,-1,1.034661,-1.034661
2,covid_19,"[0.05401922, -0.0401605, -0.009028154, -0.0023...",0,1,1.197642,1.197642
3,karnataka,"[0.037441216, -0.12201297, 0.02246373, -0.0364...",0,1,1.095556,1.095556
4,chief_minister,"[0.044404477, 0.044712942, -0.07513642, -0.043...",0,1,1.040898,1.040898
5,tuesday,"[-0.094029024, -0.0637704, 0.038474645, -0.030...",1,-1,1.037901,-1.037901
6,department,"[-0.058376696, 0.027690126, -0.043274246, 0.05...",0,1,1.069596,1.069596
7,state,"[0.04801613, -0.037459616, -0.083501406, -0.07...",0,1,1.099289,1.099289
8,infection,"[0.02234778, 0.021489538, -0.07686617, -0.0019...",0,1,1.128612,1.128612
9,news,"[0.09101163, 0.0112021575, 0.07615463, 0.10063...",0,1,1.050882,1.050882


In [None]:
# Exporting to csv format
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [None]:
# Loading the csv file into final file
final_file = df

In [None]:
# Read the dictionary back from the same relative path it was exported to
# ('sentiment_dictionary.csv') rather than an absolute /content path.
# keep_default_na=False and dtype=str on `words` keep vocabulary tokens such as
# "nan", "null" or "n/a" as literal strings; by default read_csv parses them as
# missing values, which would turn those dictionary keys into NaN.
sentiment_map = pd.read_csv(
    'sentiment_dictionary.csv',
    dtype={'words': str},
    keep_default_na=False,
)
sentiment_dict = dict(zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values))

## Tfidf

In [None]:
# Getting tfidf scores of words in every sentence, and replacing them with their associated tfidf weights:

# Work on a copy so final_file itself is not modified by the weighting steps.
file_weighting = final_file.copy()

# Summaries are already space-joined tokens, so a plain whitespace split is the
# tokenizer; norm=None leaves the tf-idf weights un-normalised.
tfidf = TfidfVectorizer(tokenizer=str.split, norm=None)
tfidf.fit(file_weighting['Summary'])

# Column position -> token, used later to map matrix columns back to words.
features = pd.Series(tfidf.get_feature_names())

# Sparse matrix with one row per summary.
transformed = tfidf.transform(file_weighting['Summary'])



In [None]:
# Replacing words in sentences with their tfidf scores

def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {word: tfidf score} dictionary for one input sentence.

    x - row of the dataframe; only its label (x.name) is used, as the row
        position inside transformed_file, so the dataframe index must match
        the row order of the tf-idf matrix,
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix),
    features - pandas Series mapping column position to the word used in TfidfVectorizer

    Returns a dict whose keys are the words with a non-zero weight in that
    sentence and whose values are their tfidf scores.
    '''
    row_coo = transformed_file[x.name].tocoo()
    # Look the words up without overwriting row_coo.col: that attribute holds
    # integer column indices, and recent SciPy versions coerce assignments to it
    # to the index dtype, so storing strings there fails.
    row_words = features.iloc[row_coo.col].values
    return dict(zip(row_words, row_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    Return the tfidf score of every word of a sentence, in sentence order.

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    A word missing from the sentence's tfidf dictionary raises KeyError.
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [scores_by_word[f'{token}'] for token in x.Summary.split()]

replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)

In [None]:
# Replacing words in sentences with their sentiment score

def replace_sentiment_words(word, sentiment_dict):
    '''
    Look up the sentiment score of a word.

    word - token to score,
    sentiment_dict - mapping from word to its sentiment coefficient

    Returns the stored score, or 0 when the word is not in the dictionary.
    '''
    return sentiment_dict.get(word, 0)

replaced_closeness_scores = file_weighting.Summary.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [None]:
# Merging both previous steps and getting the predictions

# One row per summary: per-word sentiment coefficients, per-word tfidf weights
# and the sentence itself.
replacement_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': file_weighting['Summary'],
})

# Sentence score = dot product of sentiment coefficients and tfidf weights.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.dot(np.array(row['sentiment_coeff']), np.array(row['tfidf_scores'])),
    axis=1,
)

# Strictly positive score -> 1, zero or negative score -> 0.
replacement_df['prediction'] = (replacement_df['sentiment_rate'] > 0).astype('int8')
#replacement_df['sentiment'] = ""
#replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]



In [None]:
replacement_df.head()

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction
0,"[0, 1.0349744428126229, -1.0346610751646983, 0...","[4.091042453358316, 14.742309381000606, 2.9924...",53 journalists mumbai found_negative covid_19 ...,69.746986,1
1,"[0, 0, -1.0020999165456832, 1.0380016512141024...","[4.091042453358316, 8.182084906716632, 12.2731...",jaipur newborn baby tested_negative coronaviru...,-73.638409,0
2,"[0, 0, 0, 0, 0, 0, 0, 0, -1.0134110630550304, ...","[8.182084906716632, 4.091042453358316, 3.68557...",sad share lost gurmail singh kanungo yesterday...,-56.491954,0
3,"[0, -1.0288367335333302, -1.0591179790989944, ...","[4.091042453358316, 3.3978952727983707, 2.7047...",fifteen indian states union territories accoun...,-6.655883,0
4,"[0, -1.0133127612691422, 0, 0, 0, 0, 0, 0, 0, ...","[4.091042453358316, 7.371154690500303, 4.09104...",pending mass vaccination nagarik dharma yuddha...,-84.061345,0


In [None]:
# Assigning the feature to required dataframe
df["sentiment_prediction"] = replacement_df['prediction']


In [None]:
df.head()

Unnamed: 0,Date-Time,Title,Content,Article URL,All_Content,Summary,sentiment_prediction,Actual_Sentiment
0,"21 Apr, 2020, 01:52PM IST",Covid-19: Karnataka to collect samples of jour...,The directions followed a request from Educati...,https://economictimes.indiatimes.com//news/pol...,After 53 journalists in Mumbai were found posi...,53 journalists mumbai found_negative covid_19 ...,1,0
1,"20 Apr, 2020, 07:33AM IST",Newborn tests positive for COVID-19 in Rajasth...,"Dr Shadab Ali, in-charge of Basni primary heal...",https://economictimes.indiatimes.com//news/pol...,Jaipur: A newborn baby has tested positive for...,jaipur newborn baby tested_negative coronaviru...,0,0
2,"18 Apr, 2020, 12:17PM IST",Police officer dies of COVID-19 in Ludhiana,The 52-year-old Ludhiana assistant commissione...,https://economictimes.indiatimes.com//news/pol...,The 52-year-old Ludhiana assistant commissione...,sad share lost gurmail singh kanungo yesterday...,0,0
3,"10 Apr, 2020, 02:14AM IST",The Covid curve: How the states fare,Data suggests that some of the 15 states/UTs n...,https://economictimes.indiatimes.com//news/pol...,Fifteen Indian states and Union Territories ac...,fifteen indian states union territories accoun...,0,0
4,"15 Apr, 2020, 06:33PM IST",Covid fight needs women to be agents of change,"Women civil servants and police at Centre, sta...",https://economictimes.indiatimes.com//news/pol...,"By LAKSHMI PURI In our Covid wars, it’s time t...",pending mass vaccination nagarik dharma yuddha...,0,1


In [None]:
# Measuring accuracy: share of rows whose prediction equals the labelled sentiment.
# The printed value is a fraction between 0 and 1, not a percentage.
c = int((df['sentiment_prediction'] == df['Actual_Sentiment']).sum())
acc = c / len(df)
print("Accuracy Percentage: ", acc)

Accuracy Percentage:  0.5116279069767442


In [None]:
# Exporting the dataset
# index=False keeps the row index out of the sheet. A written index comes back
# as an extra 'Unnamed: 0' column when the file is read again, which is
# presumably how the 'Unnamed: ...' columns dropped after loading got there.
# NOTE(review): df1 is the frame as loaded - the sentiment_prediction values
# computed above were assigned to df, not df1. Confirm which frame should be exported.
df1.to_excel('covid_sentiment.xlsx', index=False)

# Implementation of Sentiment Analysis using TextBlob, Afinn, Vader and SentiWordNet libraries


In [None]:
# Downloading dependencies
!pip install textblob
!pip install textsearch
!pip install contractions
!pip install afinn
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 6.4MB/s 
[?25hCollecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 13.4MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp36-cp36m-linux_x86_64.whl size=81705 sha256=292e44cfb4cf60d82ae7adacadac54338942ef2c26b92bba4f38f75d9ae77354
  Stored in d

True

In [None]:
# Loading dependencies
import pandas as pd
import numpy as np
import nltk
import textblob
from sklearn.metrics import confusion_matrix, classification_report
np.set_printoptions(precision=2, linewidth=80)

In [None]:
# Dropping duplicate rows if any 
df2 = df1.dropna().drop_duplicates().reset_index(drop=True)

In [None]:
#test
# NOTE(review): after this cell df, df1 and df2 are three names for the SAME
# DataFrame object. The df2 built from df1 in the previous cell is discarded and
# df1 no longer refers to the sheet as loaded. Every later write through df1 or
# df2 therefore also changes df, and the Summary column the lexicon-based
# analysers read is the one already processed by the Word2Vec pipeline
# (stopwords removed, "positive"/"negative" swapped, bigrams joined with "_").
# Confirm this is intended before trusting the accuracy figures below.
df2=df
df1=df

In [None]:
# Data Pre-processing

def text_to_word_list(text):
    '''
    Normalise a summary into a single cleaned string for the lexicon analysers.

    NOTE: this redefines the text_to_word_list from the Word2Vec section of the
    notebook. Despite the name, this version returns a string, not a list, and
    does not remove stopwords or swap "positive"/"negative".

    Steps: cast to str and lower-case; replace every character outside
    ``A-Za-z0-9^,!?./'+`` with a space; turn commas and periods into spaces;
    pad ``!`` and ``?`` with spaces; delete apostrophes; delete digits;
    collapse whitespace runs and strip both ends.

    text - any object; it is converted with str().

    Returns the cleaned str (possibly empty).
    '''
    text = str(text).lower()

    # Clean the text. Double quotes and colons are already turned into spaces by
    # this character filter, so they need no separate pass.
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"[,.]", " ", text)
    text = sub(r"([!?])", r" \1 ", text)
    text = sub(r"'", "", text)

    # Remove digits BEFORE normalising whitespace. Doing it afterwards left the
    # gaps of deleted numbers behind as leading and doubled spaces.
    text = sub(r"[0-9]", "", text)
    text = sub(r"\s{2,}", " ", text).strip()
    return text


df2['Summary'] = df2['Summary'].apply(lambda x: text_to_word_list(x))



In [None]:
df2['Summary'].head()

0     journalists mumbai found negative covid  karn...
1    jaipur newborn baby tested negative coronaviru...
2    sad share lost gurmail singh kanungo yesterday...
3    fifteen indian states union territories accoun...
4    pending mass vaccination nagarik dharma yuddha...
Name: Summary, dtype: object

## TextBlob

In [None]:
# Creating an attribute for storing textblob prediction values
df1['textblob_prediction'] = ''

In [None]:
# Implementing textblob

for index, row in df2.iterrows():
    polarity = textblob.TextBlob(row['Summary']).sentiment.polarity
    # Negative polarity -> 0; zero or positive polarity -> 1.
    df1.at[index, 'textblob_prediction'] = int(polarity >= 0)

In [None]:
# Measuring accuracy: share of rows whose TextBlob prediction equals the labelled sentiment.
# The printed value is a fraction between 0 and 1, not a percentage.
c = int((df1['textblob_prediction'] == df1['Actual_Sentiment']).sum())
acc = c / len(df1)
print("Accuracy Percentage: ", acc)

Accuracy Percentage:  0.5581395348837209


## Afinn

In [None]:
# Creating an attribute for storing Afinn prediction values
df1['afinn_prediction'] = ''

In [None]:
# Implementing Afinn
from afinn import Afinn

# One scorer shared by all rows; emoticons=True enables Afinn's emoticon scoring.
afn = Afinn(emoticons=True)

for index, row in df2.iterrows():
    score = afn.score(row['Summary'])
    # Negative score -> 0; zero or positive score -> 1.
    df1.at[index, 'afinn_prediction'] = int(score >= 0)


In [None]:
# Measuring accuracy: share of rows whose Afinn prediction equals the labelled sentiment.
# The printed value is a fraction between 0 and 1, not a percentage.
c = int((df1['afinn_prediction'] == df1['Actual_Sentiment']).sum())
acc = c / len(df1)
print("Accuracy Percentage: ", acc)

Accuracy Percentage:  0.6046511627906976


## Vader

In [None]:
# Creating an attribute for storing Vader prediction values
df1['vader_prediction'] = ''

In [None]:
# Using Vader (Valence Aware Dictionary and sEntiment Reasoner)

from nltk.sentiment.vader import SentimentIntensityAnalyzer

def analyze_sentiment_vader_lexicon(review, threshold=0.1, verbose=False):
    '''
    Classify a text as 'positive' or 'negative' with the VADER lexicon.

    review    - text to score,
    threshold - minimum VADER compound score for the text to count as positive,
    verbose   - accepted for backward compatibility. The detailed statistics
                table it used to build was never displayed (its print was
                disabled), so the flag currently has no visible effect.

    Returns 'positive' when the compound score is >= threshold, else 'negative'.
    '''
    # Build the analyser once and reuse it across calls; the original
    # constructed a new SentimentIntensityAnalyzer for every review.
    analyzer = getattr(analyze_sentiment_vader_lexicon, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        analyze_sentiment_vader_lexicon._analyzer = analyzer

    # 'compound' is the aggregate score VADER reports alongside pos/neg/neu.
    agg_score = analyzer.polarity_scores(review)['compound']
    return 'positive' if agg_score >= threshold else 'negative'

for index, row in df2.iterrows():
    label = analyze_sentiment_vader_lexicon(row['Summary'], threshold=0.4, verbose=True)
    # 'negative' -> 0, 'positive' -> 1 (the analyser returns only these two labels).
    df1.at[index, 'vader_prediction'] = int(label == 'positive')




In [None]:
# Measuring accuracy: share of rows whose Vader prediction equals the labelled sentiment.
# The printed value is a fraction between 0 and 1, not a percentage.
c = int((df1['vader_prediction'] == df1['Actual_Sentiment']).sum())
acc = c / len(df1)
print("Accuracy Percentage: ", acc)

Accuracy Percentage:  0.6976744186046512


## SentiWordNet

In [None]:
# Creating an attribute for storing SentiWordNet prediction values
df1['senti_prediction'] = ''

In [None]:
# Downloading dependencies and loading libraries
nltk.download('wordnet')
nltk.download('sentiwordnet')
!pip install text_normalizer
from nltk.corpus import sentiwordnet as swn
import pandas as pd
import numpy as np
import text_normalizer as tn
import spacy

# spaCy pipeline used by the SentiWordNet analyser below to tokenise and POS-tag text.
# NOTE(review): the analyser reads token.tag_ from this pipeline, yet tag=False is
# passed here. Presumably these keyword arguments are ignored by the installed
# spaCy version (the analyser would find no tags otherwise) - confirm, and drop
# them if so.
nlp = spacy.load('en', parse = False, tag=False, entity=False)

# Smoke test: take the first adjective senti-synset of the single word 'awesome'
# and print its positive, negative and objective scores.
awesome = list(swn.senti_synsets('awesome', 'a'))[0]
print('Positive Polarity Score:', awesome.pos_score())
print('Negative Polarity Score:', awesome.neg_score())
print('Objective Score:', awesome.obj_score())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
Collecting text_normalizer
  Downloading https://files.pythonhosted.org/packages/b7/98/b49628d90d5793e7369e25d6a84f9ca4a1fc6472d848d15daa9bf9129ad7/text-normalizer-0.1.3.tar.gz
Building wheels for collected packages: text-normalizer
  Building wheel for text-normalizer (setup.py) ... [?25l[?25hdone
  Created wheel for text-normalizer: filename=text_normalizer-0.1.3-cp36-cp36m-linux_x86_64.whl size=166142 sha256=c2764d85f0d398bb682cb723724fb92ddb89e092c579ebcd8b5a6603f0eb7696
  Stored in directory: /root/.cache/pip/wheels/cb/15/93/c3a18073b2bb6c6476fc1c65a9870bb0e10d939c324b40a5cc
Successfully built text-normalizer
Installing collected packages: text-normalizer
Successfully installed text-normalizer-0.1.3
Positive Polarity Score: 0.875
Negative Polarity Score: 0.12

In [None]:
# Implementation of SentiWordNet
def analyze_sentiment_sentiwordnet_lexicon(review, verbose=False):
    '''
    Classify a text as 'positive' or 'negative' with SentiWordNet.

    Every token is POS-tagged with the notebook-level spaCy pipeline `nlp`. For
    tokens tagged as noun, verb, adjective or adverb, the first SentiWordNet
    synset of the matching part of speech (if any) contributes its positive and
    negative scores. The label is decided by the mean of (positive - negative)
    over the scored tokens, rounded to 2 decimals.

    review  - text to score,
    verbose - accepted for backward compatibility. The statistics table it used
              to prepare was never displayed, so the flag currently has no
              visible effect.

    Returns 'positive' when the normalised score is >= 0, else 'negative'.
    A text in which no token has a synset scores 0 and is therefore 'positive';
    the original raised ZeroDivisionError in that case.
    '''
    # (tag fragment, WordNet POS) pairs, tried in this order. A token whose tag
    # matches a fragment but has no synset for that POS falls through to the next pair.
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    pos_score = neg_score = 0
    token_count = 0

    for token in nlp(review):
        ss_set = None
        for fragment, wordnet_pos in tag_to_wordnet_pos:
            if fragment in token.tag_:
                # Look the synsets up once; the original queried them twice.
                synsets = list(swn.senti_synsets(token.text, wordnet_pos))
                if synsets:
                    ss_set = synsets[0]
                    break
        if ss_set is None:
            continue
        pos_score += ss_set.pos_score()
        neg_score += ss_set.neg_score()
        token_count += 1

    # aggregate final scores
    if token_count == 0:
        norm_final_score = 0.0
    else:
        norm_final_score = round(float(pos_score - neg_score) / token_count, 2)

    return 'positive' if norm_final_score >= 0 else 'negative'

# NOTE(review): this loop reads Summary from df1 while the TextBlob, Afinn and
# Vader loops read it from df2 - confirm the two are meant to be the same frame.
for index, row in df1.iterrows():
    label = analyze_sentiment_sentiwordnet_lexicon(row['Summary'], verbose=True)
    # 'negative' -> 0, 'positive' -> 1 (the analyser returns only these two labels).
    df1.at[index, 'senti_prediction'] = int(label == 'positive')


In [None]:
# Measuring accuracy: share of rows whose SentiWordNet prediction equals the labelled sentiment.
# The printed value is a fraction between 0 and 1, not a percentage.
c = int((df1['senti_prediction'] == df1['Actual_Sentiment']).sum())
acc = c / len(df1)
print("Accuracy Percentage: ", acc)

Accuracy Percentage:  0.6976744186046512


In [None]:
# Visualizing the final dataframe
df1.head()

Unnamed: 0,Date-Time,Title,Content,Article URL,All_Content,Summary,sentiment_prediction,Actual_Sentiment,textblob_prediction,afinn_prediction,vader_prediction,senti_prediction
0,"21 Apr, 2020, 01:52PM IST",Covid-19: Karnataka to collect samples of jour...,The directions followed a request from Educati...,https://economictimes.indiatimes.com//news/pol...,After 53 journalists in Mumbai were found posi...,journalists mumbai found negative covid karn...,1,0,0,0,0,0
1,"20 Apr, 2020, 07:33AM IST",Newborn tests positive for COVID-19 in Rajasth...,"Dr Shadab Ali, in-charge of Basni primary heal...",https://economictimes.indiatimes.com//news/pol...,Jaipur: A newborn baby has tested positive for...,jaipur newborn baby tested negative coronaviru...,0,0,0,0,0,1
2,"18 Apr, 2020, 12:17PM IST",Police officer dies of COVID-19 in Ludhiana,The 52-year-old Ludhiana assistant commissione...,https://economictimes.indiatimes.com//news/pol...,The 52-year-old Ludhiana assistant commissione...,sad share lost gurmail singh kanungo yesterday...,0,0,0,0,0,0
3,"10 Apr, 2020, 02:14AM IST",The Covid curve: How the states fare,Data suggests that some of the 15 states/UTs n...,https://economictimes.indiatimes.com//news/pol...,Fifteen Indian states and Union Territories ac...,fifteen indian states union territories accoun...,0,0,1,1,0,1
4,"15 Apr, 2020, 06:33PM IST",Covid fight needs women to be agents of change,"Women civil servants and police at Centre, sta...",https://economictimes.indiatimes.com//news/pol...,"By LAKSHMI PURI In our Covid wars, it’s time t...",pending mass vaccination nagarik dharma yuddha...,0,1,1,0,0,1
