In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import csv
import re
import random

In [2]:
import nltk
# Fetch the NLTK resources the cells below rely on.
nltk.download('punkt')  # tokenizer models behind sent_tokenize / word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model behind nltk.pos_tag
nltk.download('wordnet')  # WordNet data behind WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kate\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kate\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kate\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the already-lemmatized test sets; the 'id' column becomes the index.
within_df = pd.read_csv(
    'lemmatized-data/within_test_gh_new.csv',
    index_col='id',
    encoding='utf-8',
)
cross_df = pd.read_csv(
    'lemmatized-data/cross_test_github.csv',
    index_col='id',
    encoding='utf-8',
)


# To start from the unlemmatized data instead, uncomment the two lines below.

#within_df = pd.read_csv('data/same-side-classification/within-topic/within_test.csv',encoding='utf-8', index_col='id')
#cross_df = pd.read_csv('data/same-side-classification/cross-topic/cross_test.csv',encoding='utf-8', index_col='id')

In [5]:
# Give every within-topic row an empty prediction and both "label was random"
# flags switched off.
# NOTE(review): the following cell performs these same assignments again
# (and also covers cross_df), so this cell looks redundant.
for column, initial in (('iss_test', np.nan), ('random_arg1', False), ('random_arg2', False)):
    within_df[column] = initial

In [6]:
# Both test sets get the same three bookkeeping columns:
# an empty prediction plus one "label was random" flag per argument.
for frame in (within_df, cross_df):
    frame['iss_test'] = np.nan
    frame['random_arg1'] = False
    frame['random_arg2'] = False

In [7]:
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer


def get_wordnet_pos(treebank_tag):
    """
    Translate a Treebank POS tag into the WordNet POS constant
    (adjective, verb, noun or adverb) used for WordNet lemmatization.

    The decision is made on the tag's first letter (J, V, N, R); any other
    tag is treated as a noun, the default POS for lemmatization.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # No known prefix matched: fall back to noun.
    return wordnet.NOUN

def lemmatize_stemming(token, pos_tag):
    """
    Lemmatize `token` with WordNet (using `pos_tag` as the part of speech)
    and then stem the lemma with the English Snowball stemmer.
    """
    lemma = WordNetLemmatizer().lemmatize(token, pos=pos_tag)
    # Snowball stemmer; see Porter, M. "An algorithm for suffix stripping."
    return SnowballStemmer("english").stem(lemma)

def preprocess(text):
    """
    Turn raw text into a flat list of lemmatized, stemmed tokens.

    The text is split into sentences and then words. Each word is POS-tagged
    in its original casing, lower-cased, and kept only if it is not a gensim
    stopword and is longer than three characters.
    """
    lemma = []
    for raw_sentence in sent_tokenize(text):
        words = word_tokenize(raw_sentence.replace('\n', ' ').strip())
        # Tag before lower-casing so the tagger sees the original casing.
        for word, (_, treebank_tag) in zip(words, nltk.pos_tag(words)):
            lowered = word.lower()
            if lowered in gensim.parsing.preprocessing.STOPWORDS or len(lowered) <= 3:
                continue
            lemma.append(lemmatize_stemming(lowered, get_wordnet_pos(treebank_tag)))
    return lemma

In [None]:
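# Lemmatize the data sets if needed: uncomment the two lines below (note: get_lemma is not defined in the cells shown here and has to be provided first).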
#within_df =  within_df.apply(get_lemma, axis=1)
#cross_df =  cross_df.apply(get_lemma, axis=1)

In [8]:
# Load the "good" and "bad" word lists: one whitespace-stripped entry per line.
with open('../positive-words.txt', 'r') as pos_file:
    pos = list(map(str.strip, pos_file))

with open('../negative-words.txt', 'r') as neg_file:
    neg = list(map(str.strip, neg_file))

In [9]:
# Run every lexicon entry through the same preprocessing as the arguments,
# then drop duplicates while keeping first-seen order.
pos_lem = list(dict.fromkeys(' '.join(preprocess(entry)) for entry in pos))
neg_lem = list(dict.fromkeys(' '.join(preprocess(entry)) for entry in neg))

In [10]:
def _polarity_label(lemmas, pos_words, neg_words):
    """Return (label, drawn_at_random) for one whitespace-separated lemma string."""
    # A missing argument contributes no tokens at all.
    tokens = [] if pd.isnull(lemmas) else lemmas.split()
    n_pos = sum(1 for token in tokens if token in pos_words)
    n_neg = sum(1 for token in tokens if token in neg_words)
    if n_pos > n_neg:
        return 1, False
    if n_neg > n_pos:
        return 0, False
    # Tie: draw 0 or 1. The label stays an int so it compares equal to the
    # deterministic labels above.
    return random.getrandbits(1), True


def choose_your_side(row, pos_words=None, neg_words=None):
    """
    Label both arguments of one row by lexicon polarity and compare the labels.

    For each argument, the lemmas found in the positive and in the negative
    lexicon are counted:

    * more positive than negative -> label 1
    * more negative than positive -> label 0
    * equal counts -> random label (0 or 1), and the matching
      'random_arg1' / 'random_arg2' column is set to True

    'iss_test' is set to True when both arguments received the same label,
    otherwise to False.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'argument1_lemmas' and 'argument2_lemmas', each a
        whitespace-separated string of lemmas or a null value.
    pos_words, neg_words : container of str, optional
        Lexicons to match lemmas against. When omitted, the notebook-level
        ``pos_lem`` and ``neg_lem`` are used.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with 'iss_test' filled in and the random flags
        set where a label had to be drawn at random.
    """
    if pos_words is None:
        pos_words = pos_lem
    if neg_words is None:
        neg_words = neg_lem

    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so work on a copy.
    row = row.copy()

    arg1_label, arg1_random = _polarity_label(row['argument1_lemmas'], pos_words, neg_words)
    if arg1_random:
        row['random_arg1'] = True

    arg2_label, arg2_random = _polarity_label(row['argument2_lemmas'], pos_words, neg_words)
    if arg2_random:
        row['random_arg2'] = True

    # Compare by value. An identity check (`is`) between an int label and a
    # bool label is always False, which used to force iss_test to False
    # whenever exactly one of the two labels was drawn at random.
    row['iss_test'] = bool(arg1_label == arg2_label)
    return row

In [11]:
# Run the rule-based labelling row by row over both test sets.
within_tested = within_df.apply(choose_your_side, axis='columns')
cross_tested = cross_df.apply(choose_your_side, axis='columns')

In [12]:
# Keep only the prediction, exposed under the column name 'label'
# and indexed like the tested frames.
cross = cross_tested['iss_test'].to_frame(name='label')
within = within_tested['iss_test'].to_frame(name='label')

In [11]:
# Write both prediction tables to CSV files in the working directory.
for table, csv_name in (
    (cross, "cross_test_predictions_rule-based.csv"),
    (within, "within_test_new_predictions_rule-based.csv"),
):
    table.to_csv(csv_name)