In [2]:
import spacy
import re
from functools import reduce
import pandas as pd
from collections import Counter

nlp = spacy.load('en_coref_md')

In [16]:
import multiprocessing as mp

In [17]:
with open('data/programming_languages.txt') as f:
    languages = f.read().split('\n')

with open("./data/neg_words.txt", 'r', encoding = "ISO-8859-1") as f:
    neg = list(map(lambda x: x.strip(), f.readlines()))
with open("./data/pos_words.txt", 'r', encoding = "ISO-8859-1") as f:
    pos  = list(map(lambda x: x.strip(), f.readlines()))
opinion_words = neg + pos

In [7]:
fin_df = pd.read_csv('./data/df_hni_2018_05.csv', low_memory=False)

# read lists from csv
for list_col in \
    ['text_sentences', 'text_words', 'title_words', 'lengths_of_text_words', 'lengths_of_title_words']:
    fin_df[list_col] = fin_df[list_col].apply(eval)

In [8]:
big_inapropriate_string = ''
for lang_orig in languages:
    big_inapropriate_string += ''.join(re.findall(pattern=re.compile("[\W]"), string=lang_orig))

inapropriate_syms = sorted(list(set(big_inapropriate_string)))

inapropriate_syms_dict = dict(zip(inapropriate_syms, 
['Whitespace', 'Exclamation', 'Sharp',  
 'Apostrophe', 'Bracket', 'Bracket',   
 'Star', 'Plus', 'Comma', 'Hyphen', 
 'Dot', 'Slash', 'Colon', 
 'Dog',  'Dash',  'Apostrophe']))

In [9]:
inapropriate_syms_dict

{' ': 'Whitespace',
 '!': 'Exclamation',
 '#': 'Sharp',
 "'": 'Apostrophe',
 '(': 'Bracket',
 ')': 'Bracket',
 '*': 'Star',
 '+': 'Plus',
 ',': 'Comma',
 '-': 'Hyphen',
 '.': 'Dot',
 '/': 'Slash',
 ':': 'Colon',
 '@': 'Dog',
 '–': 'Dash',
 '′': 'Apostrophe'}

In [10]:
languages  = list(map(lambda x: x.strip(), languages))

lang_new = []
for lang_orig in languages:
    
    lang_new.append(
        reduce(lambda x, kv: x.replace(*kv), inapropriate_syms_dict.items(), lang_orig)
    )
    
languages_dict = dict(zip(languages, lang_new))

In [11]:
fin_df['all_words'] = fin_df['text_words']  + fin_df['title_words']

fin_df.loc[fin_df['text'].isnull(), 'text'] = ''

fin_df.loc[fin_df['title'].isnull(), 'title'] = ''

fin_df['full_text'] = fin_df['text'] + fin_df['title'] 

fin_df['full_text2'] = fin_df['full_text'].apply(lambda txt: preproc_text(txt))

In [12]:
all_score_dict = dict(zip(neg, [-1 for i in range(len(neg))]))
pos_score_dict = dict(zip(pos, [1 for i in range(len(pos))]))

all_score_dict.update(pos_score_dict)

In [17]:
def preproc_text(text):
    text = re.sub(r'<\w+>', ' ', text)
    text = reduce(lambda x, kv: x.replace(*kv), languages_dict.items(), text)
    return text

def replace_pronouns(text):
    return nlp(text)._.coref_resolved

def split_sentence(text):
    '''
    splits review into a list of sentences using nlp's sentence parser
    '''
    review = nlp(text)
    bag_sentence = []
    start = 0
    for token in review:
        if token.sent_start:
            bag_sentence.append(review[start:(token.i-1)])
            start = token.i
        if token.i == len(review)-1:
            bag_sentence.append(review[start:(token.i+1)])
    return bag_sentence

# Remove special characters using regex
def remove_special_char(sentence):
    return re.sub(r"[^a-zA-Z0-9.',:;?]+", ' ', sentence)

def feature_sentiment(sentence):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature did not exist previously,
              then updates sentiment to each of the new or existing features
    output: updated dictionary
    '''

    sent_dict = Counter()
    sentence = nlp(sentence)

    for token in sentence:
        if token.text not in opinion_words:
            continue

        if (token.dep_ == "advmod"):
            continue
        
        sentiment = 1 if token.text in pos else -1
        
        if (token.dep_ == "amod"):
            
            #    'Token "{token}" dependency is AMOD, incrementing key "{head_text}" by {sentiment}'\
                
            sent_dict[token.head.text] += sentiment
            continue
        
        #'Processing token "{token}" children #1'.format(token=token.text))
        for child in token.children:
            if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                #'Child "{child}" dependency is {dep} and child is OPINION WORD'
                #'Multiplying sentiment by 1.5')
                sentiment *= 1.5
            elif child.dep_ == "neg":
                #'Child "{child}" dependency is {dep}'
                #'Multiplying sentiment by -1')
                sentiment *= -1
        
        #'Processing token "{token}" children #2'.format(token=token.text))
        for child in token.children:
            if (token.pos_ == "VERB") and (child.dep_ == "dobj"):   
                #'Token is a VERB and child "{child}" dependency is DOBJ, 
                #incrementing key "{child}" by {sentiment}'\
                   
                sent_dict[child.text] += sentiment
                
                try:
                    # check for conjugates (a AND b), then add both to dictionary
                    subchildren = []
                    conj = False

                    for subchild in child.children:
                        if subchild.text == "and":
                            conj = True

                        if conj and (subchild.text != "and"):
                            #'Found "and {sub}" conjurgate of {child}'
                            subchildren.append(subchild.text)
                            conj = False

                    for subchild in subchildren:
                        #'Incrementing key "{sub}" by {sentiment}'
                        sent_dict[subchild] += sentiment
                except:
                    pass
    
        # check for negation
        #'Processing tokens "{token}" head "{head}" children #1'.
        for child in token.head.children:
            if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                sentiment *= 1.5
            elif (child.dep_ == "neg"):
                sentiment *= -1

        # check for negation
        #'Processing tokens "{token}" head "{head}" children #2'
        #list(map(lambda x: x.text, token.head.children)))
        for child in token.head.children:
            noun = ""

            if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                noun = child.text
                
                try:
                    for subchild in child.children:
                        if subchild.dep_ == "compound":
                            noun = subchild.text + " " + noun
                except:
                    pass
                
                #'Incrementing key "{noun}" by {sentiment}'
                sent_dict[noun] += sentiment                    
                    
    return sent_dict

def sentiment_pipe(text):
    text2 = preproc_text(text)
    review = replace_pronouns(text2)
    sentences = split_sentence(review)
    terms_dict = dict()
    for sentence in sentences:
        sentence = remove_special_char(str(sentence))
        terms_dict.update(feature_sentiment(str(sentence)))
    return terms_dict

In [18]:
text1 = """C++ is a powerful and fast language. I like it."""
text2 = """What is wrong with C++? Is it a powerful and fast language at all? I don't like it."""

In [19]:
ans1 = sentiment_pipe(text1)
ans2 = sentiment_pipe(text2)

In [20]:
ans1, ans2

({'CPlusPlus': 1, 'language': 1}, {'What': -1, 'it': -1, 'language': 1})

In [73]:
%%time
fin_df.loc[0:1000, 'sentiments'] = fin_df.loc[0:1000, 'full_text'].apply(lambda x: sentiment_pipe(x))

CPU times: user 4min 23s, sys: 2min 39s, total: 7min 3s
Wall time: 1min 46s


In [63]:
106 * 230  / 60 / 60  # часов ждать пока просчитается маленький df на 200К строк

5.888888888888888

In [None]:
%%time

it = [i for i in range(: fin_df.shape[0] : 1000)]

for start, end in zip(it, it[1:] + [fin_df.shape[0]]):
    fin_df.loc[start:end, 'sentiments'] = fin_df.loc[start:end, 'full_text']\
        .apply(lambda x: sentiment_pipe(x))
    fin_df.to_csv('data/sent_spacy_hacknews.csv', index=None)
    print('saved_{}:{}'.format( start, end ))
    
print('shape of my df = {}'.format(fin_df.shape[0]))

In [None]:
%%time
fin_df['sentiments'] = fin_df[ 'full_text'].apply(lambda x: sentiment_pipe(x))

In [66]:
fin_df['sentiments'].fillna( dict(), inplace=True)

In [78]:
fin_df.to_csv('data/sleepy.csv', index=None)

In [3]:
fin_df = pd.read_csv('data/sleepy.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
from collections import defaultdict

In [12]:
fin_df = pd.read_csv('data/sleepy.csv', converters={'sentiments': eval})

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
fin_df.sentiments[3]

{'issue': -1, 'thing': 1}

In [14]:
final_sent_dict = defaultdict(int)

for local_sent_dict in fin_df['sentiments']:
    for k, v in local_sent_dict.items():
        final_sent_dict[k] += v

fin_df['sentiments'] = fin_df['full_text'].apply(lambda x:  sentiment_pipe(x))

In [18]:
langs_sent_compound = dict()
langs_sent_compound_orig_big = defaultdict(dict)
langs_sent_orig = defaultdict(int)

for lang in languages:
    for k, v in final_sent_dict.items():
        if (lang in k):
            langs_sent_compound[k] = v
            langs_sent_compound_orig_big[lang].update({k : v})
            langs_sent_orig[lang] += v

In [19]:
langs_sent_compound_orig_big['TypeScript']

{'TypeScript': 3,
 'TypeScript AngularJS NodeJS HTML': 1,
 'TypeScript converter': 1.5,
 'TypeScript support': 2}

In [23]:
languages

['A# .NET',
 'A# ',
 'Axiom',
 'A-0 System',
 'A+',
 'A++',
 'ABAP',
 'ABC',
 'ABC ALGOL',
 'ABSET',
 'ABSYS',
 'ACC',
 'Accent',
 'Ace DASL ',
 'Distributed Application Specification Language',
 'ACL2',
 'ACT-III',
 'Action!',
 'ActionScript',
 'Actor',
 'Ada',
 'Adenine',
 'Agda',
 'Agilent VEE',
 'Agora',
 'AIMMS',
 'Aldor',
 'Alef',
 'ALF',
 'ALGOL 58',
 'ALGOL 60',
 'ALGOL 68',
 'ALGOL W',
 'Alice',
 'Alma-0',
 'AmbientTalk',
 'Amiga E',
 'AMOS',
 'AMPL',
 'AngelScript',
 'Apex',
 'APL',
 "App Inventor for Android's visual block language",
 'AppleScript',
 'APT',
 'Arc',
 'ARexx',
 'Argus',
 'AspectJ',
 'Assembly language',
 'ATS',
 'Ateji PX',
 'AutoHotkey',
 'Autocoder',
 'AutoIt',
 'AutoLISP',
 'Visual LISP',
 'Averest',
 'AWK',
 'Axum',
 'Active Server Pages',
 'B',
 'Babbage',
 'Ballerina',
 'Bash',
 'BASIC',
 'bc',
 'BCPL',
 'BeanShell',
 'Batch file',
 'Bertrand',
 'BETA',
 'BLISS',
 'Blockly',
 'BlooP',
 'Boo',
 'Boomerang',
 'Bourne shell ',
 'bash',
 'ksh',
 'BPEL',
 'Bu

In [24]:
langs_sent_compound_orig_big['bash']

{'advantage bash': -1,
 'args bash': -1,
 'bash': -165.5,
 'bash problem': -1,
 'bash problems': -1,
 'bash prompt': 1,
 'bashing': -6,
 'bin bash': -15,
 'curl bash': -1,
 'env bash': -2,
 'env usr bash': 0,
 'man bash': -1,
 'pit bash': -1,
 'sudo bash': -1,
 'sudo curl bash': -1}

In [93]:
final_sent_dict['CPlusPlus']

17.5

In [34]:
final_sent_dict['zsh']

-1

In [32]:
pd.options.display.max_colwidth=3000

In [35]:
fin_df[fin_df['sentiments'].apply(lambda x: 'zsh' in x.keys())][['full_text', 'sentiments']]

Unnamed: 0,full_text,sentiments
108778,"I think it's time people start using something better than bash/zsh that is decades old, like fish or even come up with a more modern shell.<p>Even by looking at these examples, you see it has less verbosity like ""then"" and ""do"", you can reference arguments as $argv instead of cryptic $@ and exit status code as $status instead of $? which is confusing with $! and the likes.<p><a href=""https://blog.codeship.com/lets-talk-about-shell-scripting/"" rel=""nofollow"">https://blog.codeship.com/lets-talk-about-shell-scripting/</a><p><a href=""https://fishshell.com/docs/current/tutorial.html"" rel=""nofollow"">https://fishshell.com/docs/current/tutorial.html</a><p>Shell is such an integral part of admins and programmers workflow yet I find it hard to believe this field has been so slow at improving. Even the fish site jokingly states ""Finally, a command\nline shell for the 90s"" implying the others are even older.","{'something': 1, 'zsh': -1, 'likes': -1, 'arguments': -1, 'part': 1, 'decades': 1, 'field': -1, 'shell': 1}"


In [92]:
languages

['A# .NET',
 'A#',
 'Axiom',
 'A-0 System',
 'A+',
 'A++',
 'ABAP',
 'ABC',
 'ABC ALGOL',
 'ABSET',
 'ABSYS',
 'ACC',
 'Accent',
 'Ace DASL',
 'Distributed Application Specification Language',
 'ACL2',
 'ACT-III',
 'Action!',
 'ActionScript',
 'Actor',
 'Ada',
 'Adenine',
 'Agda',
 'Agilent VEE',
 'Agora',
 'AIMMS',
 'Aldor',
 'Alef',
 'ALF',
 'ALGOL 58',
 'ALGOL 60',
 'ALGOL 68',
 'ALGOL W',
 'Alice',
 'Alma-0',
 'AmbientTalk',
 'Amiga E',
 'AMOS',
 'AMPL',
 'AngelScript',
 'Apex',
 'APL',
 "App Inventor for Android's visual block language",
 'AppleScript',
 'APT',
 'Arc',
 'ARexx',
 'Argus',
 'AspectJ',
 'Assembly language',
 'ATS',
 'Ateji PX',
 'AutoHotkey',
 'Autocoder',
 'AutoIt',
 'AutoLISP',
 'Visual LISP',
 'Averest',
 'AWK',
 'Axum',
 'Active Server Pages',
 'B',
 'Babbage',
 'Ballerina',
 'Bash',
 'BASIC',
 'bc',
 'BCPL',
 'BeanShell',
 'Batch file',
 'Bertrand',
 'BETA',
 'BLISS',
 'Blockly',
 'BlooP',
 'Boo',
 'Boomerang',
 'Bourne shell',
 'bash',
 'ksh',
 'BPEL',
 'Busin

In [87]:
fixed_ks = sorted(langs_sent_orig.keys())
fixed_vs = [langs_sent_orig[k] for k in fixed_ks]
pd.DataFrame({'language':fixed_ks, 'sentiment':fixed_vs})

Unnamed: 0,language,sentiment
0,ABC,-1.0000
1,APL,6.0000
2,APT,-1.0000
3,AWK,2.0000
4,Accent,-1.0000
5,Ada,0.0000
6,Alice,-1.5000
7,Arc,2.0000
8,B,89.0000
9,BASIC,3.0000
