# Aspect-Based Sentiment Analysis

In [5]:
import nltk
nltk.download('punkt')

import pandas as pd
from nltk.tokenize import sent_tokenize
import spacy
import string

from flair.models import TextClassifier
from flair.data import Sentence
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Rules for Aspect Extraction

In [6]:
def rule1(token):
    """Pair an adjectival modifier (sentiment) with the noun it modifies (aspect).

    Example: "There has not been poorer quality of teaching."
    -> ("teaching quality", "poorer", negation: True)

    token -- parsed token exposing ``dep_``, ``tag_``, ``text``, ``head`` and
             ``children`` (the attributes of a spaCy token)
    returns -- list with a single lower-cased (aspect, sentiment, negation)
               tuple, or an empty list when the rule does not apply
    """
    # rules are adapted from https://github.com/ishikaarora/Aspect-Sentiment-Analysis-on-Amazon-Reviews/blob/master/src/models/aspect_extraction.py
    unset = "999999"  # placeholder value; a pair that still equals it is never emitted

    # the rule only fires for adjectival modifiers
    if token.dep_ != "amod":
        return []

    noun = token.head
    noun_is_attr = noun.dep_ == "attr"
    opinion = token.text
    target = noun.text

    if noun_is_attr:
        # multi-word aspects built with "of": prepend whatever hangs off the "of" child
        for kid in noun.children:
            if kid.text == "of":
                for of_kid in kid.children:
                    target = of_kid.text + " " + target

        # multi-word aspects built from compounds
        for kid in noun.children:
            if kid.dep_ == "compound":
                target = kid.text + " " + target

    # adverbial modifiers of the adjective (e.g. 'most comfortable headphones')
    for modifier in token.children:
        if modifier.dep_ == "advmod":
            opinion = modifier.text + " " + opinion

    # negation attached directly to the modified noun
    negated = any(kid.dep_ == "neg" for kid in noun.children)

    # negation (or a modal, as in "audio could have been better") attached to
    # the word that governs the noun
    if noun_is_attr:
        for sibling in noun.head.children:
            if sibling.dep_ == "neg" or (sibling.dep_ == "aux" and sibling.tag_ == "MD"):
                negated = True

    if target == unset or opinion == unset:
        return []
    return [(target.lower(), opinion.lower(), negated)]

In [7]:
def rule2(token):
    """Pair a nominal subject (aspect) with the adjectival complement of its head.

    Example: "The teaching could have been much better."
    -> ("teaching", "much better", negation: True)

    token -- parsed token exposing ``dep_``, ``tag_``, ``text``, ``head`` and
             ``children`` (the attributes of a spaCy token)
    returns -- list with a single lower-cased (aspect, sentiment, negation)
               tuple, or an empty list when the rule does not apply
    """
    unset = "999999"  # placeholder value; a pair that still equals it is never emitted

    # the rule only fires for nominal subjects
    if token.dep_ != "nsubj":
        return []

    target = token.text

    # multi-word aspects built with "of": prepend whatever hangs off the "of" child
    for kid in token.children:
        if kid.text == "of":
            for of_kid in kid.children:
                target = of_kid.text + " " + target

    # multi-word aspects built from compounds
    for kid in token.children:
        if kid.dep_ == "compound":
            target = kid.text + " " + target

    # walk the subject's siblings once, collecting sentiment and negation
    opinion = unset
    negated = False
    for sibling in token.head.children:
        label = sibling.dep_
        if label == "acomp":
            opinion = sibling.text
            for modifier in sibling.children:
                if modifier.dep_ == "advmod":
                    opinion = modifier.text + " " + opinion
        # explicit negation, or a modal as in "audio could have been better"
        if label == "neg" or (label == "aux" and sibling.tag_ == "MD"):
            negated = True

    if target == unset or opinion == unset:
        return []
    return [(target.lower(), opinion.lower(), negated)]
     

In [8]:
def rule3(token):
    """Pair a direct object (aspect) with an adjectival complement two levels up.

    Example: "It has been difficult to form an overview."
    -> ("overview", "difficult", negation: False)

    The sentiment and negation are searched among the children of
    ``token.head.head``, i.e. the word governing the verb that takes the
    object.

    token -- parsed token exposing ``dep_``, ``text``, ``head`` and
             ``children`` (the attributes of a spaCy token)
    returns -- list with a single lower-cased (aspect, sentiment, negation)
               tuple, or an empty list when the rule does not apply
    """
    rule3_pairs = []

    # initialize aspect and sentiment to a placeholder string
    aspect = "999999"
    sentiment = "999999"
    negation = False

    # token is a direct object
    if token.dep_ == "dobj":
        aspect = token.text

        # aspects that are composed of multiple words with "of"
        for child in token.children:
            if child.text == "of":
                for child_child in child.children:
                    aspect = child_child.text + " " + aspect

        # aspects that are composed of multiple words
        for child in token.children:
            if child.dep_ == "compound":
                aspect = child.text + " " + aspect

        # get sentiment and negation
        for child_of_head_head in token.head.head.children:
            if child_of_head_head.dep_ == "acomp":
                sentiment = child_of_head_head.text
                # adverbial modifiers are prepended to the adjective
                for child_of_child_of_head in child_of_head_head.children:
                    if child_of_child_of_head.dep_ == "advmod":
                        sentiment = child_of_child_of_head.text + " " + sentiment

            if child_of_head_head.dep_ == "neg":
                negation = True

    if aspect != "999999" and sentiment != "999999":
        # lower-case the strings, as rule1 and rule2 do, so that identical
        # aspects from different rules compare and group equal
        rule3_pairs.append((aspect.lower(), sentiment.lower(), negation))

    return rule3_pairs

### Import data

In [9]:
# Colab-only step: mount Google Drive at /content/drive, the location the
# CSV paths used in this notebook point to
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Read the csv as a dataframe; later cells read its 'ABSA_text' and 'id' columns
df = pd.read_csv("/content/drive/MyDrive/code_file/ABSA/syria_political.csv")

In [11]:
# Work on a reproducible random subset (fixed seed). The sample size is capped
# at the number of available rows so that an input with fewer than 2500 rows
# is used in full instead of raising ValueError.
df = df.sample(n=min(2500, len(df)), random_state=42)


### Preprocessing

In [13]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import contractions

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_absa(text, remove_stopwords=True):
    """
    Performs preprocessing for ABSA.

    Steps: sentence splitting, contraction expansion, lower-casing and
    (optionally) stop-word removal; the sentences are then joined with spaces.

    text -- string or pd.Series (or other iterable) with the different texts;
            a missing value (None / NaN / pd.NA) yields an empty string and
            non-string entries of an iterable are skipped
    remove_stopwords -- drop NLTK English stop words (default True, the
                        original behavior)
    returns -- string
    """

    if isinstance(text, str):
        texts = [text]
    elif text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        # missing cell (e.g. an empty CSV field arriving through .apply)
        return ""
    else:
        # keep only real strings so that NaN entries do not reach the tokenizer
        texts = [entry for entry in text if isinstance(entry, str)]

    # split up into sentences
    sentences = [sent for entry in texts for sent in sent_tokenize(entry)]

    # expand contractions
    sentences = [contractions.fix(sent) for sent in sentences]

    # lowercase all
    sentences = [sent.lower() for sent in sentences]

    if remove_stopwords:
        # NOTE(review): the NLTK English list contains function words such as
        # "not" and "of"; the extraction rules in this notebook test for
        # `dep_ == "neg"` and `text == "of"`, so removing them before parsing
        # presumably keeps those branches from firing — confirm, and consider
        # calling with remove_stopwords=False.
        stop_words = set(stopwords.words('english'))

        # remove stopwords
        sentences = [' '.join(word for word in sent.split() if word not in stop_words) for sent in sentences]

    # return text
    return " ".join(sentences)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Add a 'clean_text' column holding the preprocessed form of every 'ABSA_text' entry
df['clean_text'] = df['ABSA_text'].apply(preprocess_absa)


### Extract Aspects and Sentiments

In [15]:

# Load the spaCy language model; the rule functions read the dep_ / tag_
# attributes of the tokens it produces
nlp = spacy.load('en_core_web_sm')

# Apply the aspect extraction rules to each row of the dataframe
def extract_aspects(row):
    """Run all three extraction rules over one row's cleaned text.

    row -- mapping / Series providing 'clean_text' (parsed with `nlp`) and 'id'
    returns -- DataFrame with columns 'aspect', 'sentiment', 'negation' and an
               'id' column repeating row['id'] on every extracted pair
    """
    parsed = nlp(row['clean_text'])

    # token by token, in rule order 1 -> 2 -> 3
    triples = [
        triple
        for tok in parsed
        for rule in (rule1, rule2, rule3)
        for triple in rule(tok)
    ]

    pairs_frame = pd.DataFrame(triples, columns=['aspect', 'sentiment', 'negation'])
    return pairs_frame.assign(id=row['id'])

# Convert df to a DataFrame object if it is a Series object
# NOTE(review): df comes from pd.read_csv(...) followed by .sample(...), so this
# guard looks like it can never fire — confirm before removing it
if isinstance(df, pd.Series):
    df = df.to_frame()

# Create a new dataframe with the extracted information: one small frame per
# input row, stacked in a single concat with a fresh 0..n-1 index
extracted_df = pd.concat([extract_aspects(row) for _, row in df.iterrows()], ignore_index=True)


In [16]:
extracted_df

Unnamed: 0,aspect,sentiment,negation,id
0,judge,federal,False,2017-01-28FederaljudgestaysdeportationsunderTr...
1,visas,valid,False,2017-01-28FederaljudgestaysdeportationsunderTr...
2,majority,muslim,False,2017-01-28FederaljudgestaysdeportationsunderTr...
3,order,partial,False,2017-01-28FederaljudgestaysdeportationsunderTr...
4,order,broader,False,2017-01-28FederaljudgestaysdeportationsunderTr...
...,...,...,...,...
239464,son,middle,False,2014-10-22FatherofIShostageJohnCantliedies
239465,invasion,japanese,False,2014-10-22FatherofIShostageJohnCantliedies
239466,war,second,False,2014-10-22FatherofIShostageJohnCantliedies
239467,infrastructure,much,False,2014-10-22FatherofIShostageJohnCantliedies


### Assign Flair score based on sentiment

In [17]:
# Load the flair sentiment model ('en-sentiment'); per the log below it is
# downloaded and cached when not already present
classifier = TextClassifier.load('en-sentiment')

def get_aspect_sentiment(row, threshold=0.9):
    """Turn the sentiment phrase of one extracted pair into -1, 0 or +1.

    The phrase in row['sentiment'] is classified with the module-level flair
    `classifier`. Predictions scoring below `threshold` count as neutral (0);
    otherwise 'POSITIVE' maps to +1 and any other label to -1, and the sign is
    flipped when row['negation'] is truthy.

    row -- mapping / Series providing 'sentiment' (str) and 'negation' (bool)
    threshold -- minimum label score required to trust the prediction
                 (default 0.9, the value previously hard-coded)
    returns -- int in {-1, 0, 1}
    """
    sentence = Sentence(row['sentiment'])
    classifier.predict(sentence)

    # no prediction attached to the sentence -> treat as neutral instead of
    # failing with IndexError on labels[0]
    if not sentence.labels:
        return 0

    top_label = sentence.labels[0]

    # abs() is kept from the original; presumably the score is a non-negative
    # confidence, which would make it a no-op — TODO confirm
    if abs(top_label.score) < threshold:
        return 0

    # get classification
    score = 1 if str(top_label.value) == 'POSITIVE' else -1

    # check negation
    return -score if row['negation'] else score

2023-05-01 14:48:05,156 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmpn9dmy6fg


100%|██████████| 253M/253M [00:22<00:00, 11.9MB/s]

2023-05-01 14:48:28,083 copying /tmp/tmpn9dmy6fg to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt
2023-05-01 14:48:28,282 removing temp file /tmp/tmpn9dmy6fg





Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [18]:
# Apply the get_aspect_sentiment function to the dataframe row by row (axis=1),
# storing its -1 / 0 / +1 result in a new 'sentiment_score' column
extracted_df['sentiment_score'] = extracted_df.apply(get_aspect_sentiment, axis=1)

In [19]:
# Print the resulting dataframe
extracted_df

Unnamed: 0,aspect,sentiment,negation,id,sentiment_score
0,judge,federal,False,2017-01-28FederaljudgestaysdeportationsunderTr...,0
1,visas,valid,False,2017-01-28FederaljudgestaysdeportationsunderTr...,1
2,majority,muslim,False,2017-01-28FederaljudgestaysdeportationsunderTr...,0
3,order,partial,False,2017-01-28FederaljudgestaysdeportationsunderTr...,-1
4,order,broader,False,2017-01-28FederaljudgestaysdeportationsunderTr...,1
...,...,...,...,...,...
239464,son,middle,False,2014-10-22FatherofIShostageJohnCantliedies,-1
239465,invasion,japanese,False,2014-10-22FatherofIShostageJohnCantliedies,0
239466,war,second,False,2014-10-22FatherofIShostageJohnCantliedies,1
239467,infrastructure,much,False,2014-10-22FatherofIShostageJohnCantliedies,1


In [20]:
# Persist the scored aspect/sentiment pairs to Drive (to_csv writes the row
# index as well, since index is left at its default)
extracted_df.to_csv("/content/drive/MyDrive/code_file/ABSA/_syria_political_ABSA.csv")

In [21]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [22]:
from textblob import TextBlob
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
lmtzr = WordNetLemmatizer()

# Lemmatize every word of each aspect so that inflected forms of the same
# aspect end up under one grouping key
extracted_df['Aspect_lemm'] = extracted_df['aspect'].apply(lambda x: " ".join(
    [lmtzr.lemmatize(word) for word in word_tokenize(x)]))

# Aggregate once per lemmatized aspect:
#   Sum_polarity  -- sum of the -1/0/+1 scores
#   Counts        -- number of extracted pairs in the group
#   Mean_polarity -- average score
# A single named aggregation yields the same three columns, in the same order,
# as the previous join/concat/rename of three separate groupby results.
grouped_aspects = extracted_df.groupby('Aspect_lemm').agg(
    Sum_polarity=('sentiment_score', 'sum'),
    Counts=('sentiment_score', 'size'),
    Mean_polarity=('sentiment_score', 'mean'),
)
    
    


In [23]:
grouped_aspects

Unnamed: 0_level_0,Sum_polarity,Counts,Mean_polarity
Aspect_lemm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
',1,1,1.000000
-,0,16,0.000000
-- gt,2,2,1.000000
-- supporting,38,57,0.666667
-action,2,2,1.000000
...,...,...,...
zonea,2,4,0.500000
zonesa,2,2,1.000000
zor,3,3,1.000000
zora,1,1,1.000000


In [24]:
# Keep the 20 aspects with the highest 'Counts', most frequent first, and display them
grouped_aspects_sorted = grouped_aspects.sort_values('Counts', ascending=False).head(20)
grouped_aspects_sorted

Unnamed: 0_level_0,Sum_polarity,Counts,Mean_polarity
Aspect_lemm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
minister,681,3144,0.216603
government,410,3042,0.13478
time,1750,2603,0.672301
country,911,2569,0.354613
people,372,2493,0.149218
force,555,2205,0.251701
refugee,-221,2061,-0.10723
year,458,1922,0.238293
president,253,1772,0.142777
right,1392,1667,0.835033


In [25]:
# Save the top-20 aspect table to Drive; the 'Aspect_lemm' index is written as the first column
grouped_aspects_sorted.to_csv("/content/drive/MyDrive/code_file/ABSA/syria_political_grouped_ABSA.csv")