# Bake-off: The semantic orientation method

__Important__: This isn't being run as a bake-off this year. It's included in the repository in case people want to do additional exploration or incorporate this kind of evaluation into a project.

In [4]:
# Notebook metadata: author credit and the course/term string for this version.
__author__ = "Morgan Bryant orig. Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2018 term"

In [1]:
from collections import defaultdict
import csv
import importlib
import numpy as np
import os
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import vsm

In [2]:
# Directory holding the data files read below. The path is relative, so it
# resolves against the current working directory.
data_home = 'vsmdata'

## Implementation

In [3]:
def semantic_orientation(
        df,
        seeds1=default_seeds1,
        seeds2=default_seeds2,
        distfunc=vsm.cosine):
    """No frills implementation of the semantic orientation (SO) method of
    Turney and Littman. `seeds1` and `seeds2` should be representative members
    of two intuitively opposing semantic classes. The method will then try
    to rank the vocabulary by its relative association with each seed set.

    Parameters
    ----------
    df : pd.DataFrame
        The matrix used to derive the SO ranking. Its index is the
        vocabulary.
    seeds1 : iterable of str
        The default is the negative seed set of Turney and Littman.
    seeds2 : iterable of str
        The default is the positive seed set of Turney and Littman.
    distfunc : function mapping vector pairs to floats (default: `cosine`)
        The measure of distance between vectors. Can also be `euclidean`,
        `matching`, `jaccard`, as well as any other distance measure
        between 1d vectors.

    Returns
    -------
    pd.Series
        One score per row of `df`: the summed distance to the `seeds1`
        rows minus the summed distance to the `seeds2` rows, sorted in
        descending order. Because `distfunc` is a distance, words closest
        to `seeds2` (and farthest from `seeds1`) are at the top and words
        closest to `seeds1` are at the bottom.

    Raises
    ------
    ValueError
        If no member of `seeds1`, or no member of `seeds2`, occurs in
        `df.index`, since the score is undefined without both seed sets.

    """
    rownames = set(df.index)
    # Check that the seed sets are in the vocabulary, filtering
    # where necessary, and warn the user about exclusions:
    seeds1 = _value_check(seeds1, "seeds1", rownames)
    seeds2 = _value_check(seeds2, "seeds2", rownames)

    for name, seeds in (("seeds1", seeds1), ("seeds2", seeds2)):
        if not seeds:
            raise ValueError(
                "No member of {} is in the vocabulary of `df`".format(name))

    # Subframes for the two seed sets. `.loc` needs a list-like indexer
    # (recent pandas versions reject a set), and sorting keeps the row
    # order, and hence the summation order, deterministic across runs.
    sm1 = df.loc[sorted(seeds1)]
    sm2 = df.loc[sorted(seeds2)]

    # Core semantic orientation calculation: total distance from `row`
    # to every seed vector in each set, then the difference of the totals.
    def row_func(row):
        val1 = sm1.apply(lambda x: distfunc(row, x), axis=1).sum()
        val2 = sm2.apply(lambda x: distfunc(row, x), axis=1).sum()
        return val1 - val2

    scores = df.apply(row_func, axis=1)
    return scores.sort_values(ascending=False)

def _value_check(ss, name, rownames):
    new = set()
    for w in ss:
        if w not in rownames:
            print("Warning: {} not in {}".format(w, name))
        else:
            new.add(w)
    return new

In [4]:
# Load the IMDB matrix stored in 'imdb_window20-flat.csv.gz' under
# `data_home`; the first CSV column becomes the row index.
imdb20 = pd.read_csv(
    os.path.join(data_home, 'imdb_window20-flat.csv.gz'), index_col=0)

In [5]:
# Reweight the matrix with `vsm.pmi`.
# NOTE(review): the `_ppmi` suffix suggests positive PMI -- confirm against
# the defaults of `vsm.pmi`.
imdb20_ppmi = vsm.pmi(imdb20)

In [6]:
# Baseline ranking: score the whole vocabulary using the default seed sets
# and the default distance function.
imdb20_ppmi_so = semantic_orientation(imdb20_ppmi)



In [7]:
# Show the highest-scoring words (the series is sorted in descending order).
imdb20_ppmi_so.head()

excellent    0.596622
superb       0.249968
great        0.247541
superior     0.230842
nice         0.189436
dtype: float64

In [8]:
# Show the lowest-scoring words (the series is sorted in descending order).
imdb20_ppmi_so.tail()

unfortunate   -1.694198
nasty         -1.838113
poor          -1.907216
wrong         -1.929000
bad           -1.954924
dtype: float64

## Multidimensional sentiment lexicon

[Warriner et al. (2013)](http://www.humanities.mcmaster.ca/~vickup/Warriner-etal-BRM-2013.pdf) released a dataset called 'Norms of valence, arousal, and dominance for 13,915 English lemmas'. This is included in `vsmdata` as `Ratings_Warriner_et_al.csv`. The following code reads this file in and creates a DataFrame that gives just the overall means for these three semantic dimensions.

In [9]:
def load_warriner_lexicon(src_filename, df=None):
    """Load the overall mean ratings from 'Ratings_Warriner_et_al.csv',
    optionally keeping only the words that also appear in `df`.

    Parameters
    ----------
    src_filename : str
        Full path to 'Ratings_Warriner_et_al.csv'
    df : pd.DataFrame or None
        When supplied, the lexicon is restricted to the words found in
        both the lexicon and `df.index`, in sorted order.

    Returns
    -------
    pd.DataFrame
        Indexed by 'Word', with the columns 'Valence', 'Arousal' and
        'Dominance'.

    """
    # Source column -> friendly name; the key order fixes the column order.
    renames = {
        'V.Mean.Sum': 'Valence',
        'A.Mean.Sum': 'Arousal',
        'D.Mean.Sum': 'Dominance'}
    raw = pd.read_csv(src_filename, index_col=0)
    wanted = ['Word'] + list(renames)
    lexicon = raw[wanted].set_index('Word').rename(columns=renames)
    if df is None:
        return lexicon
    shared_vocab = sorted(set(lexicon.index) & set(df.index))
    return lexicon.loc[shared_vocab]

def evaluation(lexicon, so, colname='Valence', metric=pearsonr):
    """Correlate semantic-orientation scores with one lexicon dimension.

    Parameters
    ----------
    lexicon : pd.DataFrame
        Indexed by word and containing the column `colname`. As a side
        effect, a column 'so' holding the aligned scores is added to
        (or overwritten in) this frame.
    so : pd.Series
        Scores indexed by word.
    colname : str
        The lexicon column to correlate with (default: 'Valence').
    metric : function
        Called as `metric(x, y)` and expected to return a
        `(statistic, pvalue)` pair (default: `pearsonr`).

    Returns
    -------
    float
        The statistic returned by `metric`. It is also printed.

    """
    # Column assignment aligns `so` to the lexicon's index, so words that
    # have no score show up as NaN.
    lexicon['so'] = so
    # Keep only complete pairs; a single NaN would otherwise turn the
    # whole correlation into NaN.
    paired = lexicon[['so', colname]].dropna()
    rho, pvalue = metric(paired['so'], paired[colname])
    print("{0:}'s r: {1:0.3f}".format(metric.__name__, rho))
    return rho

In [10]:
''' Load data '''
# NOTE(review): `this_dir` is not referenced again in the visible cells.
this_dir = os.getcwd()
data_home = 'vsmdata'
# Read two IMDB matrices from `data_home`; the first CSV column of each
# file becomes the row index. Their shapes are printed for comparison.
imdb20 = pd.read_csv(
    os.path.join(data_home, 'imdb_window20-flat.csv.gz'), index_col=0)
imdb5 = pd.read_csv(
    os.path.join(data_home, 'imdb_window5-scaled.csv.gz'), index_col=0)
print(imdb20.shape, imdb5.shape)
# gigaword 5
# gigaword 20

# Restrict the lexicon to words that also occur in imdb20's index.
# NOTE(review): the trailing comment only speaks about shape; confirm that
# imdb5 shares imdb20's vocabulary before using this lexicon with imdb5.
lexicon = load_warriner_lexicon(
    os.path.join(data_home, 'Ratings_Warriner_et_al.csv'),
    imdb20) # imdb20 has same shape as imdb5

In [14]:
# Scratch cell: apply `vsm.length_norm` to every row of a copy of imdb20.
#ABC.apply(vsm.length_norm, axis=1)
imdb20_tmp = imdb20.copy()
# `DataFrame.apply` returns a new object. The result is not assigned, so
# `imdb20_tmp` itself is left unchanged; the value is only displayed as the
# cell's output.
imdb20_tmp.apply(vsm.length_norm, axis=1)

# Disabled experiments: column-wise normalization and a comparison of means.
#imdb20_tmp.apply(vsm.length_norm, axis=0)
#print(imdb20_tmp.mean().tail(), imdb20.mean().tail())

Unnamed: 0_level_0,Valence,Arousal,Dominance
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TV,5.42,4.29,6.23
ability,7.0,4.85,6.55
able,6.64,3.38,6.17
abortion,2.58,5.43,4.73
absolute,5.43,3.48,5.58


In [15]:
# Preprocessing pipeline for the bake-off matrix:
#   1. apply `vsm.length_norm` to each row (axis=1), then each column (axis=0);
#   2. reweight the result with `vsm.pmi`;
#   3. apply `vsm.length_norm` to rows and then columns once more.
# NOTE(review): presumably `vsm.length_norm` scales a vector to unit length --
# confirm in `vsm`.
imdb20_n0 = imdb20.apply(vsm.length_norm, axis=1)
imdb20_n01 = imdb20_n0.apply(vsm.length_norm, axis=0)
imdb20_n01_ppmi = vsm.pmi(imdb20_n01)
imdb20_n01_ppmi = imdb20_n01_ppmi.apply(vsm.length_norm, axis=1)
imdb20_n01_ppmi = imdb20_n01_ppmi.apply(vsm.length_norm, axis=0)

In [175]:
''' Morgan Bryant: this is a small bootstrapping algorithm
that seeks to expand the given default seeds (considered 
"paragons") by operating on a differentially modified dataset.'''

# Reload the helper module so that edits to it are picked up without
# restarting the kernel.
import bakeoff_semantic_orientation as bso
importlib.reload(bso)

# Operate on the seed sets
# Settings handed to `bso.bootstrap`.
# NOTE(review): their exact meaning is defined by `bso.bootstrap`, which is
# not visible here.
laplace = 1.0
nsteps = 3
nadditions = 4
def dist_factor(step):
    # Constant factor: every step is weighted the same. The disabled
    # alternative below would shrink the factor as `step` grows.
    return 1.0
    #return 1+nsteps-step

#DF = imdb20_n01_ppmi
# The seeds are expanded on the PMI-reweighted window-5 matrix, while the
# later `semantic_orientation` call scores `imdb20_n01_ppmi` instead.
df = vsm.pmi(imdb5)
# NOTE(review): `default_seeds1` / `default_seeds2` are not defined in the
# visible cells -- presumably they come from elsewhere (e.g. `bso`); verify.
# NOTE(review): `default_seeds1` (the negative words, per the log output) is
# tagged 'p' and `default_seeds2` is tagged 'n'. The tags look swapped --
# confirm how `bso.bootstrap` uses this argument.
ss1_counts, ss1 = bso.bootstrap(default_seeds1, df, 'p', 
                      dist_factor, laplace, nsteps, nadditions)
ss2_counts, ss2 = bso.bootstrap(default_seeds2, df, 'n',
                      dist_factor, laplace, nsteps, nadditions)
        
        

orig set: (('bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'),)
orig ssd: defaultdict(<function bootstrap.<locals>.<lambda> at 0x130110510>, {'bad': 1.0, 'nasty': 1.0, 'poor': 1.0, 'negative': 1.0, 'unfortunate': 1.0, 'wrong': 1.0, 'inferior': 1.0})
depth 0 on ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'] :
current set size: p 0 25
depth 1 on ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior', 'textile', 'worm', 'unpleasant', "what's", "don't", 'terribly', 'reviews', 'comments', 'positive', 'bratislava', 'excuse', 'liechtenstein', 'scouting', 'events', 'fortunate', 'guys', 'carnivore', 'seed'] :
current set size: p 1 69
depth 2 on ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior', 'textile', 'worm', 'unpleasant', "what's", "don't", 'terribly', 'reviews', 'comments', 'positive', 'bratislava', 'excuse', 'liechtenstein', 'scouting', 'events', 'fortunate', 'guys', 'carnivore', 'seed', 'hole', 'pug',

In [16]:
# Print the two expanded seed collections, separated by blank lines.
print(ss1,'\n','\n')
print(ss2)

dict_keys(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior', 'textile', 'worm', 'unpleasant', "what's", "don't", 'terribly', 'reviews', 'comments', 'positive', 'bratislava', 'excuse', 'liechtenstein', 'scouting', 'events', 'fortunate', 'guys', 'carnivore', 'seed', 'hole', 'pug', 'giant', 'slovakia', 'warsaw', 'boyfriend', 'simon', 'italian', 'whilst', 'melons', 'talent', 'mammal', 'awesome', 'reptiles', 'dated', 'disappointed', 'boring', 'enough', 'catch', 'screening', 'thoroughly', 'disgusting', 'depressing', 'actual', 'mogadishu', 'based', 'mill', 'fabric', 'garment', 'happening', 'cooking', 'sunflower', 'dandelion', 'onions', 'posted', 'feedback', 'hey', 'dolls', 'milwaukee', 'bother', 'know', 'normally', 'pathetic', 'sorry', 'pears', 'melon', 'pineapples', 'lucky', 'laredo', 'belgrade', "didn't", 'expect', 'plano', 'footage', 'run', 'steel', 'pigeon', 'rabbit', 'enjoyed', 'enjoyable', 'entertained', 'petals', 'favourite', 'salad', 'girlfriend', 'her', 'insects

In [17]:
# Re-score the vocabulary on the normalized matrix using the bootstrapped
# seed sets. This overwrites the baseline `imdb20_ppmi_so` computed earlier.
imdb20_ppmi_so = semantic_orientation(imdb20_n01_ppmi, ss1, ss2)



In [18]:
# Correlate the SO scores with each of the three lexicon dimensions, using
# the default metric. Each call prints one line.
evaluation(lexicon, imdb20_ppmi_so, colname='Valence')
evaluation(lexicon, imdb20_ppmi_so, colname='Arousal')
evaluation(lexicon, imdb20_ppmi_so, colname='Dominance')

pearsonr's r: 0.186
pearsonr's r: 0.026
pearsonr's r: 0.169


In [19]:
# Display the working directory; the relative `data_home` path resolves
# against it.
os.getcwd()

'/Users/morganbryant/Desktop/stanford/CS/v_224u/cs224u'

## Bake-off submission

1. The name of the count matrix you started with (must be one in `vsmdata`).
1. The seed-sets you used.
1. A description of the steps you took to create your bake-off VSM – must be different from the above baseline.
1. Your Pearson r values for 'Valence', 'Arousal', and 'Dominance'.