In [None]:
# !conda install bertopic
# pip install pyspellchecker
# pip install vaderSentiment
# !pip install contractions

In [None]:
import pandas as pd
import numpy as np
import os
from spellchecker import SpellChecker
import nltk
import string
nltk.download('averaged_perceptron_tagger')
import contractions
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [None]:
# Read the processed feature table from disk and preview its first rows.
clean_reviews = pd.read_csv(
    filepath_or_buffer='../../data/processed/added_features.csv',
)
clean_reviews.head()

In [None]:
# Independent (deep) snapshot of the frame as loaded, taken before any
# feature columns are added to `clean_reviews` below.
clean_reviews_orginal = clean_reviews.copy(deep=True)

# POS Tags
Features derived from the POS tags:
1. No. of nouns
2. No. of verbs
3. Cardinal digits

In [None]:
# Lower-case each text column and split it into word tokens with NLTK.
# (source column, destination column) pairs are processed in this order.
for text_column, token_column in (
    ('cleaned_text', 'tokenized'),
    ('text', 'tokenized_raw'),
):
    lowered_text = clean_reviews[text_column].str.lower()
    clean_reviews[token_column] = lowered_text.apply(nltk.word_tokenize)

In [None]:
clean_reviews

In [None]:
# Run NLTK's POS tagger over every token list; each row becomes a list of
# (token, tag) pairs.
token_lists = clean_reviews['tokenized']
clean_reviews['tagged'] = token_lists.apply(nltk.pos_tag)

In [None]:
clean_reviews[['tagged']].head()

In [None]:
# POS tags that are treated as nouns.
NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')


def _select_nouns(tagged_tokens):
    """Return the tokens of one tagged review whose tag is a noun tag."""
    selected = []
    for token, pos in tagged_tokens:
        if pos in NOUN_TAGS:
            selected.append(token)
    return selected


clean_reviews['nouns'] = clean_reviews['tagged'].apply(_select_nouns)

In [None]:
clean_reviews

In [None]:
# POS tags that are treated as verbs.
VERB_TAGS = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')


def _select_verbs(tagged_tokens):
    """Return the tokens of one tagged review whose tag is a verb tag."""
    selected = []
    for token, pos in tagged_tokens:
        if pos in VERB_TAGS:
            selected.append(token)
    return selected


clean_reviews['verbs'] = clean_reviews['tagged'].apply(_select_verbs)

In [None]:
clean_reviews[['cleaned_text', 'verbs']].head()

In [None]:
# Collect the tokens tagged 'CD' (cardinal number) in every review.
def _select_cardinals(tagged_tokens):
    """Return the tokens of one tagged review whose tag is 'CD'."""
    selected = []
    for token, pos in tagged_tokens:
        if pos == 'CD':
            selected.append(token)
    return selected


clean_reviews['cardinal_digits'] = clean_reviews['tagged'].apply(_select_cardinals)

In [None]:
clean_reviews[['cleaned_text', 'cardinal_digits']].head()

In [None]:
clean_reviews['cleaned_text'][2]

In [None]:
# Each list-valued column is reduced to its length; the mapping is
# {new count column: list column it is computed from}.
count_sources = {
    'num_digits': 'cardinal_digits',
    'num_verbs': 'verbs',
    'num_nouns': 'nouns',
    'num_tokens_cleaned': 'tokenized',
    'num_tokens_raw': 'tokenized_raw',
}
for count_column, list_column in count_sources.items():
    clean_reviews[count_column] = clean_reviews[list_column].str.len()

clean_reviews

In [None]:
# The list-valued helper columns have been summarised into counts above,
# so remove them from the frame ('tokenized_raw' is kept for now).
to_drop = [
    'tokenized',
    'tagged',
    'nouns',
    'verbs',
    'cardinal_digits',
]
clean_reviews.drop(columns=to_drop, inplace=True)

In [None]:
clean_reviews.head()

# Find Number of Typos


In [None]:
# Typo-counting helper used to build the `num_words_misspelled` feature.

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Shared SpellChecker instance; created on first use so the dictionary is
# loaded once instead of once per review.
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared SpellChecker, building it the first time it is needed."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    return _SPELL_CHECKER


def spell3(s):
    """Count the words in ``s`` that the spell checker does not recognise.

    Processing steps:
      1. Expand contractions on the original text, while apostrophes are
         still present ("don't" -> "do not").
      2. Delete ASCII punctuation.
      3. Split on whitespace, so that a multi-word expansion contributes
         separate tokens rather than one unknown "do not" token.
      4. Ask the spell checker which tokens are unknown.

    Parameters
    ----------
    s : str
        Text to check. Any non-string value (for example a NaN coming from a
        missing cell) is treated as empty text.

    Returns
    -------
    int
        Size of the collection returned by ``SpellChecker.unknown``; a word
        misspelled several times is therefore counted once.
    """
    if not isinstance(s, str):
        return 0

    # Expand first: stripping punctuation beforehand would remove the
    # apostrophes that identify a contraction.
    expanded_text = contractions.fix(s)
    without_punctuation = expanded_text.translate(_PUNCTUATION_TABLE)
    words = without_punctuation.split()

    return len(_get_spell_checker().unknown(words))


In [None]:
# First 30 rows of the raw `text` column, kept as a one-column DataFrame.
test = clean_reviews[['text']].iloc[:30]

In [None]:
# Typo count per review, computed on the raw (uncleaned) text.
raw_text = clean_reviews['text']
clean_reviews['num_words_misspelled'] = raw_text.map(spell3)

In [None]:
clean_reviews.head()

# Polarity of Text
VADER is used here because it detects negative polarity better than TextBlob does (source: https://www.analyticsvidhya.com/blog/2021/10/sentiment-analysis-with-textblob-and-vader/).

In [None]:
clean_reviews['text'][0]

In [None]:
clean_reviews['cleaned_text'][0]

In [None]:
# Analyzer instance used by the exploratory cells that follow.
sid_obj = SentimentIntensityAnalyzer()

In [None]:
print(sid_obj.polarity_scores(clean_reviews.text[0])) 

In [None]:
sid_obj.polarity_scores(clean_reviews['cleaned_text'][0])['compound']

The compound polarity score computed on the cleaned text is sufficient.

In [None]:
# Shared analyzer for compound_polarity_score(); created on first use so the
# sentiment lexicon is loaded once rather than once per review.
_POLARITY_ANALYZER = None


def compound_polarity_score(text):
    """Return the VADER ``compound`` polarity score of ``text``.

    Parameters
    ----------
    text : str
        Text to score. Any non-string value (for example a NaN coming from a
        missing cell) is scored as 0.0, the value VADER assigns to text with
        no sentiment-bearing tokens.

    Returns
    -------
    float
        The ``'compound'`` entry of ``polarity_scores(text)``.
    """
    global _POLARITY_ANALYZER
    if _POLARITY_ANALYZER is None:
        _POLARITY_ANALYZER = SentimentIntensityAnalyzer()

    if not isinstance(text, str):
        return 0.0

    return _POLARITY_ANALYZER.polarity_scores(text)['compound']

In [None]:
# Compound polarity per review, computed on the cleaned text.
cleaned_text = clean_reviews['cleaned_text']
clean_reviews['polarity'] = cleaned_text.map(compound_polarity_score)

In [None]:
clean_reviews.head()

In [None]:
clean_reviews.polarity.mean() #makes sense, since we have more +ve sentiment reviews

# Subjectivity 
A higher subjectivity score means the text is less objective and therefore more opinionated.

In [None]:
def getSubjectivity(text):
    """Return the subjectivity score that TextBlob assigns to ``text``."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.subjectivity

In [None]:
# Subjectivity per review, computed on the cleaned text.
cleaned_text = clean_reviews['cleaned_text']
clean_reviews['subjectivity'] = cleaned_text.map(getSubjectivity)
clean_reviews

In [None]:
clean_reviews.head()

# Number of +ve, -ve, neutral words

In [None]:
# Cleaned text of the row labelled 40, used as a worked example below.
test = clean_reviews.loc[40, 'cleaned_text']
test

In [None]:
import nltk
# Imported under an alias so that it does not replace the
# `SentimentIntensityAnalyzer` name already imported from `vaderSentiment`
# at the top of the notebook.
from nltk.sentiment.vader import SentimentIntensityAnalyzer as NltkSentimentIntensityAnalyzer

# Shared NLTK VADER analyzer for count_pos_neg_neutral(); created on first use
# so the lexicon is loaded once rather than once per review.
_WORD_ANALYZER = None


def _get_word_analyzer():
    """Return the shared NLTK VADER analyzer, fetching its lexicon if it is missing."""
    global _WORD_ANALYZER
    if _WORD_ANALYZER is None:
        try:
            _WORD_ANALYZER = NltkSentimentIntensityAnalyzer()
        except LookupError:
            # The analyzer needs the 'vader_lexicon' NLTK resource; download it
            # on demand, as is done for the POS tagger in the import cell.
            nltk.download('vader_lexicon')
            _WORD_ANALYZER = NltkSentimentIntensityAnalyzer()
    return _WORD_ANALYZER


def count_pos_neg_neutral(text, pos_threshold=0.5, neg_threshold=-0.5):
    """Count positive, negative and neutral words in ``text``.

    The text is split on whitespace and each word is scored on its own with
    VADER. A word is positive when its compound score is >= ``pos_threshold``,
    negative when it is <= ``neg_threshold``, and neutral otherwise.

    Parameters
    ----------
    text : str
        Text to analyse. Any non-string value (for example a NaN coming from
        a missing cell) is treated as empty text.
    pos_threshold : float, optional
        Minimum compound score for a positive word (default 0.5).
    neg_threshold : float, optional
        Maximum compound score for a negative word (default -0.5).

    Returns
    -------
    list of int
        ``[num_positive, num_negative, num_neutral]``.
    """
    if not isinstance(text, str):
        return [0, 0, 0]

    sid = _get_word_analyzer()
    num_pos = 0
    num_neg = 0
    num_neu = 0

    for word in text.split():
        # Score each word once and reuse the value for both comparisons.
        compound = sid.polarity_scores(word)['compound']
        if compound >= pos_threshold:
            num_pos += 1
        elif compound <= neg_threshold:
            num_neg += 1
        else:
            num_neu += 1

    return [num_pos, num_neg, num_neu]


In [None]:
# One [positive, negative, neutral] count list per review.
cleaned_text = clean_reviews['cleaned_text']
clean_reviews['num_pos_neg_neutral_words'] = cleaned_text.map(count_pos_neg_neutral)
clean_reviews.head() # a lot of neutral words, not very insightful, keep +ve and -ve enough

In [None]:
# Unpack the first two entries of each count list (positive, negative)
# into their own columns; the neutral count is not kept.
word_counts = clean_reviews['num_pos_neg_neutral_words']
clean_reviews['num_pos_words'] = word_counts.str.get(0)
clean_reviews['num_neg_words'] = word_counts.str.get(1)
clean_reviews

In [None]:
clean_reviews.columns

In [None]:
# Remove the combined count list and the raw token lists before saving.
clean_reviews.drop(
    columns=['num_pos_neg_neutral_words', 'tokenized_raw'],
    inplace=True,
)

In [None]:
clean_reviews.columns

In [None]:
# Write the enriched feature table to disk.
# NOTE(review): the row index is written as an unnamed first column (pandas
# default, spelled out here) - confirm that downstream readers expect it.
clean_reviews.to_csv(
    path_or_buf='../../data/processed/added_features2.csv',
    index=True,
)