In [3]:
import pandas as pd
import numpy as np
import pickle

import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from nltk.tokenize import sent_tokenize, word_tokenize 

from sklearn.feature_extraction.text import TfidfVectorizer 

Idea: Match claims in reason dataset with IBM CoPA's by topic, then compare cosine similarities using avg W2V embeddings. Map topic to CoPA subject to embeddings, and one df to keep track of each topic in Reason.

In [4]:
pickle_copadict = open("./principle_argument_CoPA/PA_dict.pkl","rb")
principle_args = pickle.load(pickle_copadict)
print(principle_args)

{'Adolescent Rights': ['Many adolescents cannot make responsible decisions', 'Adolescents are as capable as adults'], 'Animal Rights': ['Animals should not be treated as property', 'There is nothing wrong with using animals to further human interests'], 'Big government': ['Public utility is best served by actions coordinated by central government', 'Public interest is best served and propelled by voluntary interactions,and not ones dictated by government'], 'Black market': ['Prohibiting products and activities makes them less visible and available, and thus less harmful', 'Prohibition is counterproductive and only leads to increased demand'], 'Clean energy': ['Humanity must embrace clean energy in order to fight climate change', 'Ecological concerns add further strain on the economy'], 'Coercion': ['A decisive and enforced policy is the best way to deliver a message', 'Enforcement tends to be less effective than persuasion and education'], 'Conservatism': ['The current system is workin

In [5]:
copa_df = pd.read_csv("./principle_argument_CoPA/IBM_Debater_(R)_CoPA-Motion-ACL-2019.v0.csv")
copa_df

Unnamed: 0,Action,Topic,Wikipedia title,Motion phrasing,Initial list?,# CoPAs,Adolescent rights,Animal rights,Big government,Black market,...,Right to privacy,Self-determination,Sexual morality,Subsidies,Technology,Tradition,Value of science,Virtual life,Wealth distribution,Welfare state
0,mandatory,Abstinence-only sex education,Abstinence-only sex education,Abstinence-only sex education should be mandatory,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0
1,end,affirmative action,Affirmative action,We should end affirmative action,1.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
2,increase,airport racial profiling in the United States,Airport racial profiling in the United States,We should increase airport racial profiling in...,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
3,ban,alcoholic beverages,Alcoholic drink,We should ban alcoholic beverages,1.0,4.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
4,abolish,the American Jobs Act,American Jobs Act,We should abolish the American Jobs Act,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,,,,,,,,,,,...,,,,,,,,,,
700,,,,,,,,,,,...,,,,,,,,,,
701,,,,,,,,,,,...,,,,,,,,,,
702,,,,,,,,,,,...,,,,,,,,,,


In [6]:
import pathlib
pathlib.Path().absolute()

PosixPath('/Users/joshy/cs577/577project-gitrepo/data')

In [7]:
wv_from_bin = KeyedVectors.load_word2vec_format("./word2vec/GoogleNews-vectors-negative300.bin",limit=50000, binary=True)

In [8]:
reasons_df = pd.read_csv("./reason-dataset.csv")
reasons_df

Unnamed: 0,id,topic,stance,substance,source,line
0,0,abortion,Con,c-adopt,Q34,And if it is not possible for your to have a b...
1,1,abortion,Con,c-kill,Q34,I believe that abortion cannot be justified be...
2,2,abortion,Con,c-baby_right,Q34,"Moreover, United Nations declaration says chil..."
3,3,abortion,Con,c-baby_right,M25,"Even if it doesn't have a brain, my belief is ..."
4,4,abortion,Con,c-sex,O43,"Can't you use a condom while having sex. Yes, ..."
...,...,...,...,...,...,...
2843,2843,obama,Pro,p-quality,B64,"While I wish he would have accomplished more, ..."
2844,2844,obama,Pro,p-republicans,B64,Obama inherited many of the deficiencies lefto...
2845,2845,obama,Con,c-War,F38,He has gotten troops out of countries Bush...
2846,2846,obama,Pro,p-economy,K33,fincial reform- made the banks do their buince...


In [54]:
docs = reasons_df['line'].tolist()
stances = reasons_df['stance'].tolist()
docs[:2]
stances[:2]

['Con', 'Con']

In [10]:
# settings that you use for count vectorizer will go here
tfidf_vectorizer=TfidfVectorizer(use_idf=True, sublinear_tf=True)
# just send in all your docs here
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

In [11]:
(tfidf_vectorizer_vectors.shape[0])

2848

In [12]:
# # this is just for show, not necessary
# # get the first vector out (for the first document)
# first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]
# # place tf-idf values in a pandas data frame
# df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
# df.sort_values(by=["tfidf"],ascending=False)

In [13]:
tfidf_vocab = tfidf_vectorizer.vocabulary_
type(tfidf_vocab)

dict

In [14]:
tfidf_vectorizer_vectors.shape

(2848, 5810)

In [15]:
'may' in tfidf_vocab.keys()

True

In [16]:
def get_avg_vector_tfidf(w2v_model, vocab, words):
    # remove out-of-vocabulary words
    words = [vocab[word] * w2v_model[word] for word in words if (word in w2v_model.vocab and word in vocab.keys())]
    if len(words) >= 1:
        return np.mean(words, axis=0)
    else:
        return []

In [17]:
def embed_sentence(s, vocab):
    if s[-1] == '.':
        s=s[:-1]
    words = s.split()
    return get_avg_vector_tfidf(wv_from_bin, vocab, words)
test = (embed_sentence('It is a fact that there are differences between people. Hence, there should sometimes be differences in the way people are treated.', tfidf_vocab))


In [18]:
(np.mean(test, axis=0)).shape

()

In [19]:
CoPAs = pd.read_pickle(r'principle_argument_CoPA/PA_list.pkl')
CoPAs[:2]

['Many adolescents cannot make responsible decisions',
 'Adolescents are as capable as adults']

In [20]:
# settings that you use for count vectorizer will go here
tfidf_vectorizer_copa=TfidfVectorizer(use_idf=True, sublinear_tf=True)
# just send in all your docs here
tfidf_vectorizer_copa_vectors=tfidf_vectorizer_copa.fit_transform(CoPAs)
tfidf_vectorizer_copa_vectors.shape
tfidf_copa_vocab = tfidf_vectorizer_copa.vocabulary_


In [21]:
tfidf_copa_vocab

{'many': 210,
 'adolescents': 11,
 'cannot': 48,
 'make': 207,
 'responsible': 304,
 'decisions': 81,
 'are': 20,
 'as': 21,
 'capable': 49,
 'adults': 12,
 'animals': 18,
 'should': 324,
 'not': 233,
 'be': 31,
 'treated': 368,
 'property': 282,
 'there': 356,
 'is': 186,
 'nothing': 234,
 'wrong': 402,
 'with': 395,
 'using': 376,
 'to': 364,
 'further': 145,
 'human': 162,
 'interests': 182,
 'public': 290,
 'utility': 377,
 'best': 38,
 'served': 320,
 'by': 46,
 'actions': 6,
 'coordinated': 71,
 'central': 51,
 'government': 150,
 'interest': 181,
 'and': 17,
 'propelled': 281,
 'voluntary': 384,
 'interactions': 180,
 'ones': 243,
 'dictated': 90,
 'prohibiting': 278,
 'products': 276,
 'activities': 7,
 'makes': 208,
 'them': 354,
 'less': 196,
 'visible': 383,
 'available': 25,
 'thus': 361,
 'harmful': 154,
 'prohibition': 279,
 'counterproductive': 73,
 'only': 244,
 'leads': 194,
 'increased': 174,
 'demand': 85,
 'humanity': 163,
 'must': 224,
 'embrace': 110,
 'clean': 56

In [22]:
def cosine(u, v):
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
def measure_pa_similarity(dataframe):
    CoPAs_embedding = [] # works
    for arg in CoPAs:
        CoPAs_embedding.append(embed_sentence(arg.lower(), tfidf_copa_vocab))
#     print(CoPAs_embedding)
    similarity_scores = []
    for index, row in dataframe.iterrows():
        sentence=row['line']
        claim_embedding = embed_sentence(sentence.lower(), tfidf_vocab)
#         print('claim_embedding ',claim_embedding)
        similarity = []
        for arg in CoPAs_embedding:
            similarity.append(cosine(claim_embedding, arg))
        similarity_scores.append(similarity)
    # similarity_scores is a list of a list of 74 embeddings for each line
    # len(similarity_scores[0]) ==  74
    dataframe['similarity'] = pd.Series(similarity_scores)
    return dataframe, CoPAs_embedding
reasons_df, copa_embeddings = measure_pa_similarity(reasons_df)

In [23]:
reasons_df

Unnamed: 0,id,topic,stance,substance,source,line,similarity
0,0,abortion,Con,c-adopt,Q34,And if it is not possible for your to have a b...,"[0.48416767, 0.4241976, 0.54075974, 0.69915974..."
1,1,abortion,Con,c-kill,Q34,I believe that abortion cannot be justified be...,"[0.47884166, 0.43939564, 0.5063248, 0.70720005..."
2,2,abortion,Con,c-baby_right,Q34,"Moreover, United Nations declaration says chil...","[0.43071118, 0.45206025, 0.48450536, 0.5497553..."
3,3,abortion,Con,c-baby_right,M25,"Even if it doesn't have a brain, my belief is ...","[0.44407952, 0.4239362, 0.49705744, 0.70402575..."
4,4,abortion,Con,c-sex,O43,"Can't you use a condom while having sex. Yes, ...","[0.41161808, 0.39982748, 0.43759474, 0.6643834..."
...,...,...,...,...,...,...,...
2843,2843,obama,Pro,p-quality,B64,"While I wish he would have accomplished more, ...","[0.5100023, 0.41690615, 0.553481, 0.7322623, 0..."
2844,2844,obama,Pro,p-republicans,B64,Obama inherited many of the deficiencies lefto...,"[0.39890993, 0.26361057, 0.3582708, 0.5272109,..."
2845,2845,obama,Con,c-War,F38,He has gotten troops out of countries Bush...,"[0.39314574, 0.4079126, 0.45908293, 0.6267522,..."
2846,2846,obama,Pro,p-economy,K33,fincial reform- made the banks do their buince...,"[0.48969442, 0.34099227, 0.43148765, 0.6171639..."


In [24]:
reasons_df['similarity']

0       [0.48416767, 0.4241976, 0.54075974, 0.69915974...
1       [0.47884166, 0.43939564, 0.5063248, 0.70720005...
2       [0.43071118, 0.45206025, 0.48450536, 0.5497553...
3       [0.44407952, 0.4239362, 0.49705744, 0.70402575...
4       [0.41161808, 0.39982748, 0.43759474, 0.6643834...
                              ...                        
2843    [0.5100023, 0.41690615, 0.553481, 0.7322623, 0...
2844    [0.39890993, 0.26361057, 0.3582708, 0.5272109,...
2845    [0.39314574, 0.4079126, 0.45908293, 0.6267522,...
2846    [0.48969442, 0.34099227, 0.43148765, 0.6171639...
2847    [0.44804606, 0.35858345, 0.51951706, 0.5623813...
Name: similarity, Length: 2848, dtype: object

In [25]:
len(copa_embeddings) # should be a list of vectors

74

In [26]:
tfidf_copa_vocab

{'many': 210,
 'adolescents': 11,
 'cannot': 48,
 'make': 207,
 'responsible': 304,
 'decisions': 81,
 'are': 20,
 'as': 21,
 'capable': 49,
 'adults': 12,
 'animals': 18,
 'should': 324,
 'not': 233,
 'be': 31,
 'treated': 368,
 'property': 282,
 'there': 356,
 'is': 186,
 'nothing': 234,
 'wrong': 402,
 'with': 395,
 'using': 376,
 'to': 364,
 'further': 145,
 'human': 162,
 'interests': 182,
 'public': 290,
 'utility': 377,
 'best': 38,
 'served': 320,
 'by': 46,
 'actions': 6,
 'coordinated': 71,
 'central': 51,
 'government': 150,
 'interest': 181,
 'and': 17,
 'propelled': 281,
 'voluntary': 384,
 'interactions': 180,
 'ones': 243,
 'dictated': 90,
 'prohibiting': 278,
 'products': 276,
 'activities': 7,
 'makes': 208,
 'them': 354,
 'less': 196,
 'visible': 383,
 'available': 25,
 'thus': 361,
 'harmful': 154,
 'prohibition': 279,
 'counterproductive': 73,
 'only': 244,
 'leads': 194,
 'increased': 174,
 'demand': 85,
 'humanity': 163,
 'must': 224,
 'embrace': 110,
 'clean': 56

USELESS USELESS USELESS IGNORE IGNORE IGNORE BELOW THIS POINT

In [52]:
from sklearn.linear_model import LogisticRegression
from zeugma.embeddings import EmbeddingTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
# toy playing around w embeddings
glove = EmbeddingTransformer('glove')
embeddings = glove.transform(['what is zeugma like this fool what he doin These mappings come in different formats. Most pre-trained embeddings are available as a space-separated text file, where each line contains a word in the first position', 'a figure of speech'])

cosine_similarity(embeddings)[0, 1]

0.96027905

In [50]:
embeddings.shape

(2, 25)

In [81]:
from sklearn.model_selection import train_test_split
X=docs
y=stances
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [82]:
X_train[6]

'It is murder, and there is no way around it.'

In [83]:
y_train[6]

'Con'

In [84]:
len(X_train)

2136

In [85]:
len(X_test)

712

In [86]:
import time
start_time = time.time()

glove = EmbeddingTransformer('glove')
x_train = glove.transform(X_train)

model = LogisticRegression()
model.fit(x_train, y_train)

x_test = glove.transform(X_test)
model.predict(x_test)

print("--- %s seconds ---" % (time.time() - start_time))

--- 37.28241801261902 seconds ---


In [87]:
len(x_train)

2136

In [88]:
prediction1 = model.predict(x_test)
prediction1


array(['Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Con', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Con', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Con', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Con',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro', 'Pro',
       'Con', 'Pro',

In [None]:
--- 44.127750873565674 seconds ---