In [10]:
import spacy
import re
from functools import reduce
import pandas as pd
from collections import Counter

# Load the spaCy pipeline once for the whole notebook. `replace_pronouns` reads
# `doc._.coref_resolved` from the documents it produces, so the loaded model must
# register that extension.
nlp = spacy.load('en_coref_md')

In [9]:
# Programming-language names, one per line. Only split on '\n' here; the entries
# are stripped later, after the symbol table has been built from the raw names.
with open('data/programming_languages.txt') as f:
    languages = f.read().split('\n')

# Negative and positive opinion lexicons: one whitespace-stripped entry per line.
with open("./data/neg_words.txt", 'r', encoding = "ISO-8859-1") as f:
    neg = [line.strip() for line in f]
with open("./data/pos_words.txt", 'r', encoding = "ISO-8859-1") as f:
    pos = [line.strip() for line in f]

# Every word that carries sentiment, negative entries first.
opinion_words = neg + pos

In [23]:
fin_df = pd.read_csv('./data/df_hni_2018_05.csv', low_memory=False)

# These columns were written to the CSV as text and are turned back into Python
# objects (lists, per the original note) by evaluating each cell.
# NOTE(review): eval() executes whatever expression a cell contains, so this is
# only acceptable for a trusted file. If the cells are plain literals,
# ast.literal_eval would be the safer parser — confirm before switching.
list_columns = (
    'text_sentences',
    'text_words',
    'title_words',
    'lengths_of_text_words',
    'lengths_of_title_words',
)
for list_col in list_columns:
    fin_df[list_col] = fin_df[list_col].apply(eval)

In [11]:
# Collect every non-word character that occurs in any language name.
# The pattern is a raw string (a bare "\W" is an invalid string escape) and is
# compiled once instead of once per language.
non_word_char = re.compile(r"\W")

big_inapropriate_string = ''.join(
    sym
    for lang_orig in languages
    for sym in non_word_char.findall(lang_orig)
)

inapropriate_syms = sorted(set(big_inapropriate_string))

# Explicit symbol -> replacement-word table. Pairing the sorted symbols with a
# positional list of names would silently mislabel every symbol as soon as the
# language file gains or loses a character, so each name is tied to its symbol.
symbol_names = {
    ' ': 'Whitespace',
    '!': 'Exclamation',
    '#': 'Sharp',
    "'": 'Apostrophe',
    '(': 'Bracket',
    ')': 'Bracket',
    '*': 'Star',
    '+': 'Plus',
    ',': 'Comma',
    '-': 'Hyphen',
    '.': 'Dot',
    '/': 'Slash',
    ':': 'Colon',
    '@': 'Dog',
    '–': 'Dash',        # non-ASCII dash
    '′': 'Apostrophe',  # non-ASCII apostrophe-like mark
}

# Fail loudly on a symbol that has no name yet rather than guessing one.
unknown_syms = [sym for sym in inapropriate_syms if sym not in symbol_names]
if unknown_syms:
    raise ValueError(
        'No replacement name defined for symbol(s) {syms!r}; add them to symbol_names'
        .format(syms=unknown_syms)
    )

# Same shape as before: only the symbols actually present, in sorted order.
inapropriate_syms_dict = {sym: symbol_names[sym] for sym in inapropriate_syms}

In [14]:
# Display the symbol -> name mapping so it can be checked by eye.
inapropriate_syms_dict

{' ': 'Whitespace',
 '!': 'Exclamation',
 '#': 'Sharp',
 "'": 'Apostrophe',
 '(': 'Bracket',
 ')': 'Bracket',
 '*': 'Star',
 '+': 'Plus',
 ',': 'Comma',
 '-': 'Hyphen',
 '.': 'Dot',
 '/': 'Slash',
 ':': 'Colon',
 '@': 'Dog',
 '–': 'Dash',
 '′': 'Apostrophe'}

In [16]:
# Trim surrounding whitespace from every language name.
languages = [name.strip() for name in languages]

# For each name, spell out every special symbol with its word
# (applied in the mapping's own order), producing an alphanumeric-only alias.
lang_new = []
for lang_orig in languages:
    alias = lang_orig
    for symbol, symbol_word in inapropriate_syms_dict.items():
        alias = alias.replace(symbol, symbol_word)
    lang_new.append(alias)

# Original name -> alias; later duplicates of a name overwrite earlier ones.
languages_dict = dict(zip(languages, lang_new))

In [24]:
# Row-wise concatenation of the two token lists.
fin_df['all_words'] = fin_df['text_words'] + fin_df['title_words']

# Missing text/title become empty strings so the concatenation below never yields NaN.
fin_df['text'] = fin_df['text'].fillna('')
fin_df['title'] = fin_df['title'].fillna('')

# Join body and title with a space: without a separator the last word of the
# text and the first word of the title fuse into a single token. The strip
# removes the stray space left when either side is empty.
fin_df['full_text'] = (fin_df['text'] + ' ' + fin_df['title']).str.strip()

# NOTE(review): preproc_text is defined in a later cell of this notebook, so on a
# fresh kernel this line raises NameError unless that cell has been run first.
fin_df['full_text2'] = fin_df['full_text'].apply(preproc_text)

In [41]:
# Word -> polarity lookup: every negative word scores -1, every positive word +1.
pos_score_dict = dict.fromkeys(pos, 1)
all_score_dict = dict.fromkeys(neg, -1)

# Positive entries are merged last, so a word present in both lists ends up at +1.
all_score_dict.update(pos_score_dict)

In [20]:
def preproc_text(text, lang_map=None):
    '''
    Strip simple HTML tags and replace programming-language names by their
    symbol-free aliases.

    input: text (str); lang_map (optional dict, original name -> alias),
           defaults to the module-level `languages_dict`
    function: replaces opening, closing and self-closing tags without attributes
              (<p>, </p>, <br/>) by a space, then substitutes every language name
              that differs from its alias
    output: the processed text (str)
    '''
    if lang_map is None:
        lang_map = languages_dict

    # Closing and self-closing tags are handled too, not only opening ones.
    text = re.sub(r'</?\w+\s*/?>', ' ', text)

    # Longest names first: otherwise a name that is a prefix of another
    # ("A+" vs "A++") rewrites part of the longer one and leaves stray symbols.
    for name in sorted(lang_map, key=len, reverse=True):
        alias = lang_map[name]
        # Empty names and names that already equal their alias are no-ops.
        if name and name != alias:
            text = text.replace(name, alias)
    return text

def replace_pronouns(text):
    '''
    Run `text` through the module-level `nlp` pipeline and return the value of
    the resulting document's `coref_resolved` extension attribute.
    '''
    doc = nlp(text)
    return doc._.coref_resolved

def split_sentence(text):
    '''
    splits review into a list of sentences using nlp's sentence parser

    input: text (str)
    output: list with one span per sentence, in document order; empty list
            when the parsed document has no sentences

    The sentence boundaries come straight from the parsed document's `sents`
    iterator. The earlier hand-written loop sliced each sentence up to
    `token.i - 1`, which dropped the final token of every sentence and produced
    a bogus `[0:-1]` span whenever the very first token was flagged as a
    sentence start.
    '''
    review = nlp(text)
    return list(review.sents)

# Collapse runs of unwanted characters into a single space
def remove_special_char(sentence):
    '''
    Replace every run of characters other than ASCII letters, digits and
    . ' , : ; ? with one space and return the resulting string.
    '''
    disallowed_run = re.compile(r"[^a-zA-Z0-9.',:;?]+")
    return disallowed_run.sub(' ', sentence)

def feature_sentiment(sentence):
    '''
    input: sentence (str); it is parsed with the module-level `nlp` pipeline
    function: for every token found in `opinion_words`, works out a sentiment
              value (+1 if the token is in `pos`, otherwise -1, scaled by 1.5 for
              opinion-word modifiers and flipped by negations) and adds it to the
              features the token is attached to in the dependency parse
    output: a new Counter mapping feature text -> accumulated sentiment

    Progress is reported with print() at every step.
    '''

    sent_dict = Counter()
    sentence = nlp(sentence)

    for token in sentence:
        # Only opinion words contribute sentiment.
        if token.text not in opinion_words:
            continue

        # Adverbial modifiers are not scored on their own; they only act as
        # intensifiers of other opinion words (handled in the child loops below).
        if (token.dep_ == "advmod"):
            continue
        
        sentiment = 1 if token.text in pos else -1
        
        # An adjectival modifier scores the word it modifies directly.
        if (token.dep_ == "amod"):
            print(
                'Token "{token}" dependency is AMOD, incrementing key "{head_text}" by {sentiment}'\
                .format(
                    token=token.text,
                    head_text=token.head.text,
                    sentiment=sentiment
                )
            )
            sent_dict[token.head.text] += sentiment
            continue
        
        # Pass 1 over the token's children: intensifiers and negations adjust
        # the sentiment before it is assigned to anything.
        print('Processing token "{token}" children #1'.format(token=token.text))
        for child in token.children:
            if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                print('Child "{child}" dependency is {dep} and child is OPINION WORD'\
                    .format(
                        child=child.text,
                        dep=child.dep_.upper()
                    )
                )
                print('Multiplying sentiment by 1.5')
                sentiment *= 1.5
            elif child.dep_ == "neg":
                print('Child "{child}" dependency is {dep}'\
                    .format(
                        child=child.text,
                        dep=child.dep_.upper()
                    )
                )
                print('Multiplying sentiment by -1')
                sentiment *= -1
        
        # Pass 2 over the token's children: an opinion verb scores its direct
        # object, plus anything joined to that object with "and".
        print('Processing token "{token}" children #2'.format(token=token.text))
        for child in token.children:
            if (token.pos_ == "VERB") and (child.dep_ == "dobj"):   
                print('Token is a VERB and child "{child}" dependency is DOBJ, incrementing key "{child}" by {sentiment}'\
                    .format(
                        child=child.text,
                        sentiment=sentiment
                    )
                )
                sent_dict[child.text] += sentiment
                
                # check for conjugates (a AND b), then add both to dictionary
                # `subchildren` collects the TEXT of each word that directly
                # follows an "and" among the object's children.
                subchildren = []
                conj = False
                
                for subchild in child.children:
                    if subchild.text == "and":
                        conj = True
                    
                    if conj and (subchild.text != "and"):
                        print('Found "and {sub}" conjurgate of {child}'.format(
                            sub=subchild.text,
                            child=child.text
                        ))
                        subchildren.append(subchild.text)
                        conj = False
                
                for subchild in subchildren:
                    # `subchild` is already a plain string here; calling `.text`
                    # on it raised AttributeError for every conjunct found.
                    print('Incrementing key "{sub}" by {sentiment}'.format(
                        sub=subchild,
                        sentiment=sentiment
                    ))
                    sent_dict[subchild] += sentiment
    
        # check for negation
        # Same intensifier/negation adjustment, this time over the siblings
        # (children of the token's head).
        # NOTE(review): if the parser reports a root token as its own head, the
        # modifiers handled in pass 1 are applied a second time here — confirm
        # whether that is intended.
        print('Processing tokens "{token}" head "{head}" children #1'.format(
            token=token.text,
            head=token.head.text
        ))
        for child in token.head.children:
            if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                sentiment *= 1.5
            elif (child.dep_ == "neg"):
                sentiment *= -1

        # Score every noun sibling that has no entry yet, prefixing the text of
        # its compound children so multi-word names form a single key.
        print('Processing tokens "{token}" head "{head}" children #2'.format(
            token=token.text,
            head=token.head.text
        ))
        print(list(map(lambda x: x.text, token.head.children)))
        for child in token.head.children:
            noun = ""

            if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                noun = child.text
                
                for subchild in child.children:
                    if subchild.dep_ == "compound":
                        noun = subchild.text + " " + noun
                
                print('Incrementing key "{noun}" by {sentiment}'.format(
                    noun=noun,
                    sentiment=sentiment
                ))
                sent_dict[noun] += sentiment                    
                    
    print(sent_dict)
    return sent_dict

def sentiment_pipe(text):
    '''
    Full pipeline for one document.

    input: text (str)
    function: preprocesses the text, resolves pronouns, splits it into sentences,
              cleans each sentence and scores it with feature_sentiment
    output: dict mapping feature text -> sentiment summed over all sentences
    '''
    text2 = preproc_text(text)
    review = replace_pronouns(text2)

    terms_dict = Counter()
    for sentence in split_sentence(review):
        cleaned = remove_special_char(str(sentence))
        # Counter.update ADDS the per-sentence scores; a plain dict.update would
        # overwrite a feature's score with whatever the last sentence gave it.
        terms_dict.update(feature_sentiment(cleaned))

    # Hand back a plain dict, as before.
    return dict(terms_dict)

In [95]:
# Two short samples: a plainly positive one, and one with questions and a negation.
text1 = "C++ is a powerful and fast language. I like it."
text2 = "What is wrong with C++? Is it a powerful and fast language at all? I don't like it."

In [None]:
# Run the full pipeline on each sample text.
ans1 = sentiment_pipe(text1)
ans2 = sentiment_pipe(text2)

In [97]:
# Display both results together.
ans1, ans2

({'CPlusPlus': 1, 'language': 1}, {'What': -1, 'it': -1, 'language': 1})

In [None]:
# Score every document; each cell of 'sentiments' holds what sentiment_pipe returned for that row.
fin_df['sentiments'] = fin_df['full_text'].apply(sentiment_pipe)

In [None]:
# Corpus-wide totals per feature.
# A Counter is used because its update() ADDS the incoming scores to the running
# totals; dict.update() would overwrite them, leaving only the score from the
# last document that mentioned each feature.
final_sent_dict = Counter()

for local_sent_dict in fin_df['sentiments']:
    final_sent_dict.update(local_sent_dict)