In [1]:
import sys
import os
import pandas as pd
import argparse
import re
import ipdb

from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score as acc, precision_score as prec, recall_score as rec
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

import matplotlib.pyplot as plt 

from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()

import six
from abc import ABCMeta
import numpy as np
from scipy import sparse
from scipy.sparse import issparse

In [2]:
class NBSVM(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)):
    """Linear SVM trained on log-ratio-scaled features, one model per class.

    For every class a row of smoothed log-ratio weights is computed from the
    feature totals of the samples in that class (see ``_compute_ratios``).
    The input matrix is multiplied element-wise by that row and a
    ``LinearSVC`` is fitted on the class's 0/1 indicator column.

    Parameters
    ----------
    alpha : float, default=1.0
        Additive smoothing: the starting value of every per-class feature
        total before the observed totals are added.
    C : float, default=1.0
        Forwarded to each ``LinearSVC`` as ``C``.
    max_iter : int, default=10000
        Forwarded to each ``LinearSVC`` as ``max_iter``.

    Attributes set by ``fit``
    -------------------------
    classes_ : labels as reported by ``LabelBinarizer``.
    class_count_ : zero vector with one entry per class; only its length is
        read (by the scoring code) and it is never updated with real counts.
    ratios_ : CSR matrix of shape (n_classes, n_features) holding the
        log-ratio weights.
    svm_ : list with one fitted ``LinearSVC`` per class, in column order of
        the binarized labels.
    """

    def __init__(self, alpha=1.0, C=1.0, max_iter=10000):
        self.alpha = alpha
        self.max_iter = max_iter
        self.C = C
        # Kept for backward compatibility with code that reads ``svm_`` on an
        # unfitted estimator; ``fit`` always replaces it with a fresh list.
        self.svm_ = []

    def fit(self, X, y):
        """Compute the per-class log-ratio weights and fit one SVM per class.

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features), non-negative.
        y : array-like of shape (n_samples,) with the class labels.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` contains negative values (raised by ``_compute_ratios``).
        """
        X, y = check_X_y(X, y, 'csr')
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            # A single indicator column was returned: expand it to two
            # columns (1 - Y, Y) so that every class gets its own column.
            Y = np.concatenate((1 - Y, Y), axis=1)

        # Work in floating point so the indicator matrix can be multiplied
        # with X when accumulating feature totals.
        Y = Y.astype(np.float64)

        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        # Every cell starts at alpha (the smoothing term); the observed
        # totals are added on top in _compute_ratios.
        self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                               dtype=np.float64)
        self._compute_ratios(X, Y)

        # Build the models in a fresh list.  Appending to the existing
        # ``self.svm_`` would leave the models of a previous fit at indices
        # 0..n-1, and predict() would silently keep using those stale models.
        fitted = []
        for i in range(n_effective_classes):
            svm = LinearSVC(C=self.C, max_iter=self.max_iter)
            svm.fit(X.multiply(self.ratios_[i]), Y[:, i])
            fitted.append(svm)
        self.svm_ = fitted

        return self

    def _decision_scores(self, X):
        """Return the (n_classes, n_samples) matrix of per-class SVM scores."""
        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        D = np.zeros((n_effective_classes, n_examples))
        for i in range(n_effective_classes):
            # Apply the same class-specific feature scaling used in fit().
            D[i] = self.svm_[i].decision_function(X.multiply(self.ratios_[i]))
        return D

    def predict_proba(self, X):
        """Return the raw per-class decision scores for ``X``.

        Despite the name these are ``LinearSVC.decision_function`` values, not
        probabilities, and the result has shape (n_classes, n_samples), i.e.
        one row per class.  The shape is kept as-is for existing callers.
        """
        return self._decision_scores(X)

    def predict(self, X):
        """Return, for each row of ``X``, the class whose SVM scores highest."""
        D = self._decision_scores(X)
        return self.classes_[np.argmax(D, axis=0)]

    def _compute_ratios(self, X, Y):
        """Turn per-class feature totals into log-ratio weights (in ``ratios_``).

        ``ratios_`` must already be filled with the smoothing value.  The
        per-class feature totals ``Y.T @ X`` are added, each row is
        L1-normalized to proportions ``p``, and ``log(p / (1 - p))`` is stored
        as a CSR matrix.

        Raises
        ------
        ValueError
            If ``X`` contains negative values, or (via ``check_array``) if the
            resulting weights are not finite.
        """
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative")
        self.ratios_ += safe_sparse_dot(Y.T, X)  # smoothing + per-class feature totals
        # Assign the result instead of relying on copy=False mutating in place.
        self.ratios_ = normalize(self.ratios_, norm='l1', axis=1, copy=False)
        # p -> log(p / (1 - p)), element-wise over the whole matrix.
        self.ratios_ = np.log(np.divide(self.ratios_, (1 - self.ratios_)))
        check_array(self.ratios_)
        self.ratios_ = sparse.csr_matrix(self.ratios_)

In [3]:
def replace_ent(tweet, ent):
    """Replace every ``@<ent>`` mention in ``tweet`` with ``@USER``.

    Matching is case-insensitive.  The handle is treated as literal text, and
    a mention is only replaced when it is not immediately followed by another
    word character, so masking ``bob`` leaves ``@bobby`` untouched.

    Parameters
    ----------
    tweet : str
        Text to search.
    ent : str
        Account name to mask, without the leading ``@``.

    Returns
    -------
    str
        ``tweet`` with the matching mentions replaced.
    """
    # re.escape: keep regex metacharacters in the handle literal.
    # (?!\w): do not match when the handle is only a prefix of a longer one.
    pattern = re.compile(r"\@" + re.escape(ent) + r"(?!\w)", re.IGNORECASE)
    return pattern.sub("@USER", tweet)

# Load the annotated tweets (tab-separated file).
df = pd.read_csv('../data-annotation/maj_df_split.tsv', sep='\t')

# Step 1: mask the mentioned account of each row inside its tweet text.
mention_masked = df.apply(
    lambda row: replace_ent(tweet=row['tweet'], ent=row['mentname']),
    axis=1,
)
# Step 2: tokenize with the tweet tokenizer and re-join tokens with single spaces.
df['tweet_clean'] = mention_masked.apply(
    lambda text: " ".join(tweet_tokenizer.tokenize(text))
)

# Binary target: -1 becomes 0, every other value becomes 1.
df['group'] = df['group'].apply(lambda label: 0 if label == -1 else 1)

In [4]:
# Show the prepared frame to eyeball the new `tweet_clean` column and the recoded `group` labels.
df

Unnamed: 0,TweetId,tweet,username,mentname,Date,Predom,Feeling,Behavior,Admiration,Admiration_Int,...,Joy,Joy_Int,Sadness,Sadness_Int,Surprise,Surprise_Int,group,party,Split,tweet_clean
0,1001885064974790657,Admire @OfficialCBC Chairman @reprichmond's mo...,reprokhanna,reprichmond,2018-05-30,predom,warm,app,True,3.000000,...,False,2.000000,False,2.0,False,2.0,1,D,train,Admire @OfficialCBC Chairman @USER ' s moral v...
1,1002970603165536258,It was wonderful to celebrate the 150th annive...,senatorcantwell,pattymurray,2018-06-02,predom,warm,app,False,2.000000,...,True,2.333333,False,2.0,False,2.0,1,D,test,It was wonderful to celebrate the 150th annive...
2,1003713413292462082,I’m proud to support legislation Senator @timk...,pattymurray,timkaine,2018-06-04,predom,warm,app,False,2.333333,...,True,2.333333,False,2.0,False,2.0,1,D,train,I ’ m proud to support legislation Senator @US...
3,1004419010333691904,"During #ImmigrantHeritageMonth, let’s remember...",repdelbene,speakerryan,2018-06-06,predom,cold,disapp,False,2.000000,...,False,2.000000,False,2.0,False,2.0,0,D,train,"During #ImmigrantHeritageMonth , let ’ s remem..."
4,1004433631950065664,Thank you @senatorleahy. The Administration is...,senbobcasey,senatorleahy,2018-06-06,predom,warm,app,True,2.000000,...,True,1.666667,False,2.0,False,2.0,1,D,train,Thank you @USER . The Administration is choosi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3028,999245047202811904,.@stevekingia should keep his eyes and “ears” ...,repspeier,stevekingia,2018-05-23,predom,cold,disapp,False,2.000000,...,False,2.000000,False,2.0,False,2.0,0,D,train,. @USER should keep his eyes and “ ears ” on t...
3029,999289718683721728,"We’ve lost 630,000 Americans to overdoses sinc...",speakerryan,gopleader,2018-05-23,predom,neutral,dunno,False,2.000000,...,False,2.000000,False,2.0,False,2.0,1,R,train,"We ’ ve lost 630,000 Americans to overdoses si..."
3030,999329645316247552,The House approved an amendment to the Nationa...,kencalvert,gracenapolitano,2018-05-23,predom,warm,app,False,2.000000,...,False,2.000000,False,2.0,False,2.0,0,R,train,The House approved an amendment to the Nationa...
3031,999381351634743299,Led a letter to President Trump with @mikekell...,senatorenzi,mikekellypa,2018-05-23,predom,warm,app,False,2.000000,...,False,1.666667,False,2.0,False,2.0,1,R,test,Led a letter to President Trump with @USER and...


In [5]:
# Full corpus and labels (only referenced by the commented-out random split).
tweets = df['tweet_clean'].values
groups = df['group'].values

# Row masks for the predefined partitions stored in the `Split` column.
is_train = df['Split'] == 'train'
is_dev = df['Split'] == 'dev'
is_test = df['Split'] == 'test'

x_train = df.loc[is_train, 'tweet_clean'].values
y_train = df.loc[is_train, 'group'].values

x_dev = df.loc[is_dev, 'tweet_clean'].values
y_dev = df.loc[is_dev, 'group'].values

x_test = df.loc[is_test, 'tweet_clean'].values
y_test = df.loc[is_test, 'group'].values


# x_train, x_test, y_train, y_test = train_test_split(tweets, groups, test_size=0.2, random_state=1) 

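CountVectorizer builds a vocabulary of all unigrams and bigrams in the data and represents each tweet by its counts of them (binarized to presence/absence here, because the pipeline sets `binary=True`).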

NBSVM turns these counts into features by rescaling them with log ratios: how often does this unigram/bigram occur in in-group versus out-group tweets? The SVM is then trained on the log-ratio-weighted features.

In [6]:
# Vectorizer created with defaults; its n-gram range, token pattern and binary flag
# are set later through the pipeline's set_params call.
vect = CountVectorizer()
# NBSVM with its default hyperparameters (alpha=1.0, C=1.0, max_iter=10000).
classifier = NBSVM()

In [7]:
# Vectorizer settings: whitespace-delimited tokens, unigrams + bigrams, binary indicators.
ngram = (1, 2)    # only unigrams and bigrams
params = dict(
    vect__token_pattern=r"\S+",
    vect__ngram_range=ngram,
    vect__binary=True,
)

# Chain the vectorizer and the classifier, then push the settings into the 'vect' step.
clf = Pipeline(steps=[('vect', vect), ('nbsvm', classifier)])
clf.set_params(**params)

Pipeline(steps=[('vect',
                 CountVectorizer(binary=True, ngram_range=(1, 2),
                                 token_pattern='\\S+')),
                ('nbsvm', NBSVM())])

In [8]:
# Fit the whole pipeline (vectorizer, then NBSVM) on the training split only.
clf.fit(x_train, y_train)



Pipeline(steps=[('vect',
                 CountVectorizer(binary=True, ngram_range=(1, 2),
                                 token_pattern='\\S+')),
                ('nbsvm', NBSVM())])

In [17]:
# Score the fitted pipeline on the dev split.
# NOTE(review): with average='micro' on single-label predictions this value
# should coincide with plain accuracy - confirm that is the intended metric.
pred = clf.predict(x_dev)
dev_f1 = f1_score(y_dev, pred, average='micro')
print("F1 Score: ", dev_f1)

F1 Score:  0.6246153846153846


In [10]:
# Score the fitted pipeline on the held-out test split (rounded to 3 decimals).
# NOTE(review): with average='micro' on single-label predictions this value
# should coincide with plain accuracy - confirm that is the intended metric.
pred = clf.predict(x_test)
test_f1 = np.round(f1_score(y_test, pred, average='micro'), 3)
print("F1 Score: ", test_f1)

F1 Score:  0.625


In [14]:
def top_coefficients(coef, feature_names, top_features=25):
    """Print and display the features with the largest and smallest weights.

    Parameters
    ----------
    coef : 1-D array of weights, aligned with ``feature_names``.
    feature_names : sequence of feature labels.
    top_features : int, default=25
        Number of features to show at each end of the ranking.

    The largest weights are shown first under "Top out-group features"
    (strongest first); the smallest weights follow under
    "Top in-group features" (most negative first).
    """
    names = np.array(feature_names)
    ranking = np.argsort(coef)  # ascending by weight

    largest_first = ranking[-top_features:][::-1]
    smallest_first = ranking[:top_features]

    print("Top out-group features:")
    display(pd.DataFrame(names[largest_first]))

    print("Top in-group features:")
    display(pd.DataFrame(names[smallest_first]))

In [15]:
# Pull the per-class linear SVMs out of the fitted pipeline and flatten their weights.
fitted_svms = clf['nbsvm'].svm_
feature_names = clf['vect'].get_feature_names_out()

coefs_0 = fitted_svms[0].coef_.ravel()
coefs_1 = fitted_svms[1].coef_.ravel()

Top features

In [16]:
# Rank the vocabulary by the weights in coefs_1 (taken from the second per-class SVM, svm_[1]).
top_coefficients(coef=coefs_1, feature_names=feature_names)

Top out-group features:


Unnamed: 0,0
0,wishing a
1,thanks @user
2,@user !
3,bipartisan
4,", senator"
5,wishing
6,"process ,"
7,birthday rep
8,. wishing
9,senator @user


Top in-group features:


Unnamed: 0,0
0,have a
1,wishing you
2,"thanks ,"
3,thoughts on
4,@user a
5,love @user
6,@user
7,"you ,"
8,#groundhogday
9,#groundhogday #hbd
