In [1]:
import pandas as pd
import numpy as np
from common import utils
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import time
import re

# Read both splits of the stance dataset from Excel: training first, then test.
twitter_train, twitter_test = (
    pd.read_excel('./StanceDataset/train.xlsx'),
    pd.read_excel('./StanceDataset/test.xlsx'),
)

## Implement baseline - SVM with n-gram features as per the original paper

In [2]:
twitter_train.head()

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos


In [3]:
twitter_test.head()

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,He who exalts himself shall be humbled; a...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,pos
1,RT @prayerbullets: I remove Nehushtan -previou...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,other
2,@Brainman365 @heidtjj @BenjaminLives I have so...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,pos
3,#God is utterly powerless without Human interv...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@David_Cameron Miracles of #Multiculturalism...,Atheism,AGAINST,2. The tweet does NOT expresses opinion about ...,neg


In [4]:
def preprocess_tweets(x):
    """Rewrite a tweet so its special characters survive word tokenisation.

    The characters ``@``, ``!``, ``#`` and ``?`` are replaced with the
    alphabetic placeholders ``ATSIGN``, ``XPT``, ``HASHTAG`` and ``QUESTM``
    so that they are retained when CountVectorizer builds word n-grams;
    these characters are likely important for capturing stance in a tweet.
    Every run of consecutive digits is then collapsed to a single ``D``.

    Note that a placeholder is inserted without surrounding whitespace, so it
    fuses with any adjacent word (``"@user"`` becomes ``"ATSIGNuser"``).

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with placeholders substituted and digit runs normalised.
    """
    placeholders = (
        ("@", "ATSIGN"),
        ("!", "XPT"),
        ("#", "HASHTAG"),
        ("?", "QUESTM"),
    )

    tokens = x
    for symbol, word in placeholders:
        tokens = tokens.replace(symbol, word)

    # Raw string: "\d" in a normal literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    return re.sub(r"\d+", "D", tokens)

In [5]:
def svm_stance(train, test, topic_list, binary=False):
    """Train the baseline n-gram SVM on the given topics and return its macro-F1.

    Rows of ``train`` and ``test`` whose ``Target`` matches one of the topics
    (case-insensitively) are selected. Word 1-3-grams are extracted from the
    tweets after ``preprocess_tweets``; character 2-5-grams are extracted from
    the raw tweets. Both feature sets are stacked and fed to a linear-kernel
    ``SVC``. Training and prediction wall-clock times are printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data with ``Tweet``, ``Target`` and ``Stance`` columns.
    test : pandas.DataFrame
        Test data with the same columns.
    topic_list : list of str
        Target names to include. The list is not modified.
    binary : bool, default False
        If True, n-gram counts are replaced by presence/absence indicators
        (done inside ``CountVectorizer``, so the matrices stay sparse). The
        default keeps raw counts, which is what the reported scores use; the
        notebook authors observed higher F1 scores with raw counts.

    Returns
    -------
    float
        Macro-averaged F1 score over the selected test rows, with stance
        encoded as FAVOR -> 2, NONE -> 1, anything else -> 0.

    Raises
    ------
    ValueError
        If no training rows or no test rows match ``topic_list``.
    """
    # Build a lower-cased copy instead of rewriting the caller's list in place.
    topics = {topic.lower() for topic in topic_list}

    # Boolean masks of the rows pertaining to the requested topics.
    matches_train = train['Target'].str.lower().isin(topics)
    matches_test = test['Target'].str.lower().isin(topics)

    # Select from the frames that were passed in, not from notebook globals.
    selected_train = train[matches_train]
    selected_test = test[matches_test]
    if selected_train.empty or selected_test.empty:
        raise ValueError(
            "No rows match the requested topics in the training and/or test data: "
            + repr(sorted(topics))
        )

    tweets_train = selected_train['Tweet']
    tweets_test = selected_test['Tweet']

    # Integer labels for sklearn: FAVOR -> 2, NONE -> 1, everything else -> 0.
    stance_codes = {"FAVOR": 2, "NONE": 1}
    stance_labels_train = np.array([stance_codes.get(s, 0) for s in selected_train['Stance']])
    stance_labels_test = np.array([stance_codes.get(s, 0) for s in selected_test['Stance']])

    # Word n-grams are built from preprocessed text so @, !, # and ? are kept.
    preprocess_train = tweets_train.apply(preprocess_tweets)
    preprocess_test = tweets_test.apply(preprocess_tweets)

    # Fit the vocabulary on the training tweets only, then reuse it for test.
    word_grams = CountVectorizer(ngram_range=(1, 3), binary=binary)
    train_nwords = word_grams.fit_transform(preprocess_train)
    test_nwords = word_grams.transform(preprocess_test)

    # Character n-grams are taken from the raw (un-preprocessed) tweets.
    char_grams = CountVectorizer(ngram_range=(2, 5), analyzer='char', binary=binary)
    train_nchars = char_grams.fit_transform(tweets_train)
    test_nchars = char_grams.transform(tweets_test)

    # Stack the word and char n-gram features side by side.
    train_grams = hstack((train_nwords, train_nchars))
    test_grams = hstack((test_nwords, test_nchars))

    # Train the SVM.
    start_time_train = time.time()
    svm = SVC(kernel='linear')
    svm.fit(train_grams, stance_labels_train)
    end_time_train = time.time()
    print("Training time:", end_time_train - start_time_train)

    # Predict on the test rows.
    start_time_predict = time.time()
    stance_predict = svm.predict(test_grams)
    end_time_predict = time.time()
    print("Prediction time:", end_time_predict - start_time_predict)

    # Macro average: each stance class contributes equally to the score.
    f1 = f1_score(stance_labels_test, stance_predict, average='macro')

    return f1

In [6]:
# Baseline restricted to the "Atheism" target
f1_ath = svm_stance(train=twitter_train, test=twitter_test, topic_list=['Atheism'])
print("F1 score for atheism:", f1_ath)

Training time: 0.722930908203125
Prediction time: 0.18128275871276855
F1 score for atheism: 0.5680704546253422


In [7]:
# Baseline restricted to the "Hillary Clinton" target
f1_hil = svm_stance(train=twitter_train, test=twitter_test, topic_list=['Hillary Clinton'])
print("F1 score for Hillary", f1_hil)

Training time: 1.180267095565796
Prediction time: 0.3243858814239502
F1 score for Hillary 0.6178715314400302


In [8]:
# Baseline restricted to the "Legalization of Abortion" target
f1_abort = svm_stance(train=twitter_train, test=twitter_test, topic_list=['Legalization of Abortion'])
print("F1 score for Abortion", f1_abort)

Training time: 1.0263760089874268
Prediction time: 0.29291200637817383
F1 score for Abortion 0.574521535156262


In [9]:
# Baseline restricted to the "Climate Change is a Real Concern" target
f1_clim = svm_stance(train=twitter_train, test=twitter_test, topic_list=['Climate Change is a Real Concern'])
print("F1 score for climate change", f1_clim)

Training time: 0.39832425117492676
Prediction time: 0.10786604881286621
F1 score for climate change 0.43827024647887325


  'precision', 'predicted', average, warn_for)


In [10]:
# Baseline restricted to the "Feminist Movement" target
f1_fem = svm_stance(train=twitter_train, test=twitter_test, topic_list=['Feminist Movement'])
print("F1 score for feminist mov", f1_fem)

Training time: 1.085402250289917
Prediction time: 0.3067920207977295
F1 score for feminist mov 0.5631143653518248


In [11]:
# Baseline trained and evaluated on all five targets together
all_tops = [
    'Hillary Clinton',
    'Legalization of Abortion',
    'Climate Change is a Real Concern',
    'Feminist Movement',
    'Atheism',
]
f1_all = svm_stance(train=twitter_train, test=twitter_test, topic_list=all_tops)
print("F1 score for all topics", f1_all)

Training time: 20.10564875602722
Prediction time: 5.751494884490967
F1 score for all topics 0.5911640632894003
