In [1]:
import pandas as pd
import numpy as np
from common import utils
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import time

#load training data
twitter_train = pd.read_excel('./StanceDataset/train.xlsx')

#load test data
twitter_test = pd.read_excel('./StanceDataset/test.xlsx')

## Implement baseline - SVM with n-gram features as per the original paper

In [2]:
twitter_train.head()

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos


In [3]:
twitter_test.head()

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,He who exalts himself shall be humbled; a...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,pos
1,RT @prayerbullets: I remove Nehushtan -previou...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,other
2,@Brainman365 @heidtjj @BenjaminLives I have so...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,pos
3,#God is utterly powerless without Human interv...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@David_Cameron Miracles of #Multiculturalism...,Atheism,AGAINST,2. The tweet does NOT expresses opinion about ...,neg


In [4]:
def svm_stance(train, test, topic_list):
    """This function takes a list of topics and returns the stance classification f1 score
    of our baseline SVM implementation over the training / test data filtered for the applicable topics"""
    
    #convert topic list to lower case
    for i in range(0,len(topic_list)):
        topic_list[i] = topic_list[i].lower()
    
    #find boolean series of all training & target rows pertaining to the relevant topics
    matches_train = train['Target'].apply(lambda x: x.lower() in topic_list)
    matches_test = test['Target'].apply(lambda x: x.lower() in topic_list)
    
    #carve out the training tweets
    tweets_train = twitter_train[matches_train]['Tweet']
    tweets_test = twitter_test[matches_test]['Tweet']

    #carve out training labels and convert to labels for feeding sklearn SVM
    stance_labels_train = np.array(twitter_train[matches_train]['Stance'].apply(lambda x: 2 if x == "FAVOR" else 
                                 (1 if x == "NONE" else 0)))

    stance_labels_test = np.array(twitter_test[matches_test]['Stance'].apply(lambda x: 2 if x == "FAVOR" else 
                                 (1 if x == "NONE" else 0)))

    #tokenize tweets so we can determine our own n-grams (by default, CountVectorizer removes things like # in hashtags)
    tokens = tweets_train.apply(lambda x: x.split())

    #loop to generate uni, bi, and trigrams and store in list
    word_ngrams = []

    #nested for loops to create the universe of word ngrams (uni - tri)
    for tweet in tokens:
        for n in range(1,4):
            for gram in ngrams(tweet,n):
                word_ngrams.append(gram)

    #turn into unique list
    word_ngrams = np.unique(np.asarray(word_ngrams)).tolist()

    #convert training data to sparse matrix for relevant n-gram words
    word_grams = CountVectorizer(ngram_range = (1,3), vocabulary = word_ngrams)
    train_nwords = word_grams.fit_transform(tweets_train)
    
    #convert test data to sparse matrix for relevant n-gram words
    test_nwords = word_grams.transform(tweets_test)
    
    #convert training data to sparse matrix for relevant n-gram chars
    char_grams = CountVectorizer(ngram_range = (2,5), analyzer = 'char')
    train_nchars = char_grams.fit_transform(tweets_train)
    
    #convert test data to sparse matrix for relevant n-gram chars
    test_nchars = char_grams.transform(tweets_test)
    
    #stack the word and char n-grams
    train_grams = hstack((train_nwords, train_nchars))
    test_grams = hstack((test_nwords, test_nchars))
    
    #turn counts into presence/absence indicators, as mentioned in the paper
    train_grams = pd.DataFrame(train_grams.todense())
    test_grams = pd.DataFrame(test_grams.todense())
    
    train_grams = train_grams.applymap(lambda x: 1 if x > 1 else x)
    test_grams = test_grams.applymap(lambda x: 1 if x > 1 else x)
    
    train_grams = csr_matrix(train_grams.values)
    test_grams = csr_matrix(test_grams.values)
       
    #train SVM
    start_time_train = time.time()
    svm = SVC(kernel = 'linear')
    svm.fit(train_grams,stance_labels_train)
    end_time_train = time.time()
    print("Training time:",end_time_train - start_time_train)

    #prediction
    start_time_predict = time.time()
    stance_predict = svm.predict(test_grams)
    end_time_predict = time.time()
    print("Prediction time:", end_time_predict - start_time_predict)

    #calculate f1 score
    f1 = f1_score(stance_labels_test, stance_predict, average = 'macro')
    
    return f1

In [5]:
#Atheism
f1_ath = svm_stance(twitter_train, twitter_test, ['Atheism'])
print("F1 score for atheism:", f1_ath)

Training time: 0.6293482780456543
Prediction time: 0.1685028076171875
F1 score for atheism: 0.5633416647731576


In [6]:
#Hillary
f1_hil = svm_stance(twitter_train, twitter_test, ['Hillary Clinton'])
print("F1 score for Hillary", f1_hil)

Training time: 1.0795819759368896
Prediction time: 0.3540041446685791
F1 score for Hillary 0.6134134134134134


In [7]:
#Abortion
f1_abort = svm_stance(twitter_train, twitter_test, ['Legalization of Abortion'])
print("F1 score for Abortion", f1_abort)

Training time: 1.072415828704834
Prediction time: 0.30336475372314453
F1 score for Abortion 0.5849022090896299


In [8]:
#Climate
f1_clim = svm_stance(twitter_train, twitter_test, ['Climate Change is a Real Concern'])
print("F1 score for climate change", f1_clim)

Training time: 0.43463802337646484
Prediction time: 0.11292505264282227
F1 score for climate change 0.4433209751713291


  'precision', 'predicted', average, warn_for)


In [9]:
#Feminism
f1_fem = svm_stance(twitter_train, twitter_test, ['Feminist Movement'])
print("F1 score for feminist mov", f1_fem)

Training time: 1.0146219730377197
Prediction time: 0.28386688232421875
F1 score for feminist mov 0.5548274770496991


In [10]:
#All
all_tops = ['Hillary Clinton', 'Legalization of Abortion', 'Climate Change is a Real Concern',
           'Feminist Movement', 'Atheism']
f1_all = svm_stance(twitter_train, twitter_test, all_tops)
print("F1 score for all topics", f1_all)

Training time: 19.378078937530518
Prediction time: 5.23293399810791
F1 score for all topics 0.5755249133826865
