# Persian Stance Classification - Deep Learning

In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import numpy as np

# Mount Google Drive

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# input files
cleaned_path = "/content/drive/MyDrive/Stance Detection Project/dataset cleaned/Clean_Claim_Body.csv"
train_path = "/content/drive/MyDrive/Stance Detection Project/dataset cleaned/train_data.csv"
test_path = "/content/drive/MyDrive/Stance Detection Project/dataset cleaned/test_data.csv"

stopwords_path = "/content/drive/MyDrive/Stance Detection Project/Baseline Zarharan/StopWords_fa.txt"

# output file
feature_file = '/content/drive/MyDrive/Stance Detection Project/Baseline Zarharan/features/negated.npy'

# Read Cleaned Data from CSV File

In [4]:
import pandas as pd
dataset_clean = pd.read_csv(cleaned_path, index_col = 0, )

In [5]:
clean_claim = dataset_clean['claim']
clean_body = dataset_clean['body']

In [6]:
dataset_clean.head()

Unnamed: 0,claim,body,label
0,کلاهبرداری از رانندگان با شگرد نشت بنزین !,به گزارش خبرنگار گروه جامعه خبرگزاری میزان،29 ...,Discuss
1,تجاوز به دختر بازداشت شده و واژگونی ون گشت ارش...,انتشار کلیپ واژگونی ماشین گشت ارشاد توسط مردم ...,Discuss
2,تعظیم 20 دقیقه ای وزیر نیرو ژاپن به علت قطع بر...,وزیر نیروی ژاپن به علت قطع شدن برق؛ به همان مد...,Agree
3,سرمربیگری گاس هیدینک برای تراکتورسازی,به تازگی محمد تقوی استعفای خود را از سرمربیگری...,Discuss
4,کشف موجود عجیبی شبیه انسان در یک حفاری در پاکس...,پس از 20 سال حفاری با دقتی باورنکردنی، سرانجام...,Unrelated


# Read Data Train and Test from CSV Files

In [7]:
data_train = pd.read_csv(train_path, index_col = 0, )
data_test = pd.read_csv(test_path, index_col = 0, )

In [8]:
len(data_train), len(data_test)

(1597, 400)

# Cleaning Functions - Hazm

In [9]:
!pip install hazm



In [10]:
try:
    from hazm import *
    print('hazm is already installed')
except ImportError as e:
    raise Exception('hazm is not installed')

# installation command:
# !pip install hazm

hazm is already installed


In [11]:
from __future__ import unicode_literals
from hazm import *

In [12]:
stop_list = []
with open(stopwords_path, 'r', encoding="utf-8") as f:
    for word in f:
        word = word.split('\n')
        stop_list.append(word[0])

In [13]:

def remove_stopwords(text):
    sw_data = []
    for i in text:
        for j in stop_list:
            if j in word_tokenize(i):
                i.replace(j, '')
        sw_data.append(i)
    return sw_data


def remove_slash(text):
    ext_data = []
    for i in text:
        if '/' in i:
            spl = i.split('/')
            if 'شایعه' in spl[-1]:
                i = i.replace(spl[-1], '')
        ext_data.append(i)
    return ext_data


import re
r = re.compile("[\!\;,؟:?،؛.+»«<>|\#(\)\-\/\'\"]")
def remove_punc(text):
    punc = []
    for i in text:
        punc.append(r.sub("", i))
    return punc

extra_str = ['\u200c', '\u200d', '\u200e', '\u200b', '\r', '\n', '\ufeff']
def clean_data(text):

    print("start cleaning data..")

    text = remove_slash(text)

    clean_data = []
    for i in text:
        for j in extra_str:
            if j in i:
                i = i.replace(j,'')
        clean_data.append(i)

    print("data is ready!")
    return clean_data

In [14]:
import re
r = re.compile("[\!\;,؟:?،؛.+»«<>|\#(\)\-\/\'\"]")

def clean(text):
    return r.sub("", text)

# Negated Context Feature Extractor

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# this function takes a very long time to finish execution!
def negated_context_word_12grams_concat_tf5000_l2_all_data(headlines, bodies):
    """
    Negates string after special negation word by adding a "NEG_" in front
    of every negated word, until a punctuation mark appears.
    Source:
        NRC-Canada: Buidling the State-of-the-Art in Sentiment Analysis of Tweets
        http://sentiment.christopherpotts.net/lingstruc.html
        http://stackoverflow.com/questions/23384351/how-to-add-tags-to-negated-words-in-strings-that-follow-not-no-and-never


    :param headlines:
    :param bodies:
    :return:
    """
    def get_negated_text(text):
      sens = text.replace(';','.').replace(',','.').replace('!','.').replace(':','.').replace('،', '').split('.')
      li_1 = ['هیچ', 'اصلا', 'هیچگونه']
      li_2 = [ 'ندارد', 'نمیتواند']
      jomles = []
      for sen in sens:
        first, second = 0 , 0
        flag_1, flag_2 = False, False
        tokens = word_tokenize(sen)
        jomle = []
        for i in range(len(tokens)):
          if tokens[i] in li_1 and flag_1 == False:
            first = i
            flag_1 = True
          if tokens[i] in li_2 and flag_2 == False:
            second = i
            flag_2 = True
        if (second > first) and (flag_1 == True) and (flag_2 == True):
          for j in range (first + 1 , second-1 ):
            sen = sen.replace(tokens[j], 'NEG_'+tokens[j])
        jomles.append(sen)

      jomles = '. '.join(jomles)

      return jomles

    def combine_head_and_body(headlines, bodies):
        head_and_body = [headline + " " + body for i, (headline, body) in
                         enumerate(zip(headlines, bodies))]

        return head_and_body

    def get_vocab(neg_headlines, neg_bodies):
        neg_headlines = remove_stopwords(neg_headlines)
        neg_bodies = remove_stopwords(neg_bodies)
        tf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, use_idf=False,
                                        norm='l2')
        tf_vectorizer.fit_transform(combine_head_and_body(neg_headlines, neg_bodies))
        vocab = tf_vectorizer.vocabulary_

        return vocab

    def get_features(neg_headlines_test, neg_bodies_test, vocab):
        neg_headlines_test = remove_stopwords(neg_headlines_test)
        neg_bodies_test = remove_stopwords(neg_bodies_test)

        tf_vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_test_head = tf_vectorizer_head.fit_transform(neg_headlines_test)

        tf_vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_test_body = tf_vectorizer_body.fit_transform(neg_bodies_test)

        X_test = np.concatenate([X_test_head.toarray(), X_test_body.toarray()], axis=1)
        return X_test

#     h, b = get_head_body_tuples(include_holdout=True)
#     h_test, b_test = get_head_body_tuples_test()

    h, b = data_train['claim'].tolist() , data_train['body'].tolist()
    h_test, b_test = data_test['claim'].tolist(), data_test['body'].tolist()

    # Comment out for clean ablation tests
    h.extend(h_test)
    b.extend(b_test)

    neg_headlines_all = [get_negated_text(h) for h in h]
    neg_bodies_all = [get_negated_text(b) for b in b]
    neg_headlines = [get_negated_text(h) for h in headlines]
    neg_bodies = [get_negated_text(b) for b in bodies]

    vocab = get_vocab(neg_headlines_all, neg_bodies_all)
    X_train = get_features(neg_headlines, neg_bodies, vocab)

    return X_train

In [16]:
%%time

# extract negated context features and save to file for future use
x = negated_context_word_12grams_concat_tf5000_l2_all_data(clean_claim, clean_body)

print(x.shape)

(1997, 10000)
CPU times: user 19min 29s, sys: 1.98 s, total: 19min 31s
Wall time: 19min 45s


# Save Features

In [17]:
np.save(feature_file, x)