# Persian Stance Classification - Deep Learning

In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import numpy as np
import numpy as np
import os.path as path

# Mount Google Drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the project files
# referenced by the path constants below can be read and written.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base folders on the mounted Google Drive; every path below is derived from them.
PROJECT_DIR = "/content/drive/MyDrive/Stance Detection Project"
DATASET_DIR = PROJECT_DIR + "/dataset cleaned"
BASELINE_DIR = PROJECT_DIR + "/Baseline Zarharan"

# Input files: cleaned claim/body table plus the train/test splits.
cleaned_path = DATASET_DIR + "/Clean_Claim_Body.csv"
train_path = DATASET_DIR + "/train_data.csv"
test_path = DATASET_DIR + "/test_data.csv"

# Text file containing the FastText word embeddings for Farsi.
fasttext_path = BASELINE_DIR + "/cc.fa.300.vec"

# Output locations: feature directory (keeps its trailing slash because other
# cells build file names by plain string concatenation) and the sequences file.
FEATURES_DIR = BASELINE_DIR + "/features/"
sequences_file = FEATURES_DIR + "sequences.npy"

# Read Cleaned Data from CSV File

In [None]:
import pandas as pd

# Load the cleaned claim/body table; the first CSV column is used as the index.
dataset_clean = pd.read_csv(cleaned_path, index_col=0)

In [None]:
# Pull the two text columns out of the cleaned table for later feature extraction.
clean_claim, clean_body = dataset_clean['claim'], dataset_clean['body']

In [None]:
# Preview the first rows of the cleaned table (columns shown below: claim, body, label).
dataset_clean.head()

Unnamed: 0,claim,body,label
0,کلاهبرداری از رانندگان با شگرد نشت بنزین !,به گزارش خبرنگار گروه جامعه خبرگزاری میزان،29 ...,Discuss
1,تجاوز به دختر بازداشت شده و واژگونی ون گشت ارش...,انتشار کلیپ واژگونی ماشین گشت ارشاد توسط مردم ...,Discuss
2,تعظیم 20 دقیقه ای وزیر نیرو ژاپن به علت قطع بر...,وزیر نیروی ژاپن به علت قطع شدن برق؛ به همان مد...,Agree
3,سرمربیگری گاس هیدینک برای تراکتورسازی,به تازگی محمد تقوی استعفای خود را از سرمربیگری...,Discuss
4,کشف موجود عجیبی شبیه انسان در یک حفاری در پاکس...,پس از 20 سال حفاری با دقتی باورنکردنی، سرانجام...,Unrelated


# Read Train and Test Data from CSV Files

In [None]:
# Read both splits the same way: first CSV column becomes the index.
data_train, data_test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_path, test_path)
)

In [None]:
# Number of rows in the train and test splits.
data_train.shape[0], data_test.shape[0]

(1597, 400)

# Download Fasttext Word Vectors File

In [None]:
# download fasttext word vectors for persian (text file version)
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
# ------------------------------------------------------------------------------
# unzip fasttext word vectors for persian
# !gunzip cc.fa.300.vec.gz

# then copy the unzipped text file to your Google Drive for future use

# Load Function

In [None]:
def load_embedding_pandas(FILE, type="w2v"):
  """Load word vectors from a whitespace-separated text embedding file.

  Every data line is expected to hold a token followed by its vector
  components. Blank lines are skipped, and so is a leading
  ``<vocab_size> <dimension>`` header line (the first line of word2vec /
  fastText ``.vec`` text files), so neither ends up as a bogus entry.

  :param FILE: path of the embedding text file; it is read as UTF-8
  :param type: unused; kept only so existing callers keep working
  :return: dict mapping each token to a 1-D float32 numpy array
  """
  embeddings_index = {}
  # Explicit UTF-8 so non-ASCII (e.g. Persian) tokens decode the same way on
  # every platform; the context manager closes the file even if parsing fails.
  with open(FILE, encoding='utf-8') as f:
    for line_no, line in enumerate(f):
      values = line.split()
      if not values:
        # blank line: nothing to parse
        continue
      # A first line made of exactly two integers is treated as the
      # "<count> <dim>" header rather than a one-dimensional word vector.
      if line_no == 0 and len(values) == 2 and values[0].isdigit() and values[1].isdigit():
        continue
      embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
  print('Loaded %s word vectors.' % len(embeddings_index))
  return embeddings_index

# Reduction Function

In [None]:
import pickle
import nltk

def _lookup_embedding_vector(embedding, word):
    """Return the vector stored for ``word`` as a 1-D float array, or None if absent.

    ``embedding`` may be a plain dict (word -> vector) or a pandas DataFrame that
    holds one word per row (index label) or one word per column.
    """
    if isinstance(embedding, dict):
        vector = embedding.get(word)
    else:
        vector = None
        row_labels = getattr(embedding, "index", None)
        column_labels = getattr(embedding, "columns", None)
        if row_labels is not None and word in row_labels:
            vector = embedding.loc[word]
        elif column_labels is not None and word in column_labels:
            vector = embedding[word]
    if vector is None:
        return None
    return np.asarray(vector, dtype='float64').ravel()


def create_embedding_lookup_pandas(text_list, max_nb_words, embedding_dim, embedding,
                            embedding_lookup_name, embedding_vocab_name, rdm_emb_init=False, add_unknown=False, tokenizer=None, init_zeros = False):
    """
    Build (or reuse) the embedding lookup matrix and vocabulary for the given texts.

    If either file is missing under FEATURES_DIR, a unigram vocabulary is fitted on
    ``text_list`` with TfidfVectorizer, an embedding matrix with one row per vocab
    index is filled from ``embedding``, and both are saved. Otherwise only the
    saved vocabulary is loaded.

    :param text_list: list of raw text strings used to fit the vocabulary
    :param max_nb_words: maximum vocabulary size (TfidfVectorizer ``max_features``)
    :param embedding_dim: number of columns of the lookup matrix
    :param embedding: pretrained vectors; a dict word -> vector, or a DataFrame
        with one word per row or per column
    :param embedding_lookup_name: file name (inside FEATURES_DIR) of the .npy matrix
    :param embedding_vocab_name: file name (inside FEATURES_DIR) of the pickled vocab
    :param rdm_emb_init: if True, rows start with uniform random values in [0, 1)
        (row 0 stays zero); otherwise rows start as zeros
    :param add_unknown: if True, an extra "UNKNOWN" entry is appended to the vocab
    :param tokenizer: tokenizer callable handed to TfidfVectorizer
    :param init_zeros: if True, no pretrained vectors are copied into the matrix
    :return: dict mapping each vocabulary word to its (1-based) row index
    """
    lookup_path = FEATURES_DIR + embedding_lookup_name
    vocab_path = FEATURES_DIR + embedding_vocab_name

    if not path.exists(lookup_path) or not path.exists(vocab_path):
        print("can't find npy or pkl file!")
        vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=None, tokenizer=tokenizer,
                                            max_features=max_nb_words, use_idf=True)
        vectorizer.fit_transform(text_list)

        # shift every index by one: 0 is reserved for masking in the LSTM later on
        vocab = {word: index + 1 for word, index in vectorizer.vocabulary_.items()}
        if add_unknown:
            vocab["UNKNOWN"] = max(vocab.values()) + 1

        # one row per vocab index, plus the all-zero masking row 0
        if rdm_emb_init:
            embedding_lookup = np.random.random((len(vocab) + 1, embedding_dim))
            embedding_lookup[0] = np.zeros((embedding_dim))  # for masking
        else:
            embedding_lookup = np.zeros((len(vocab) + 1, embedding_dim))

        if not init_zeros:
            for word, i in vocab.items():
                if word == "UNKNOWN":
                    embedding_lookup[i] = np.random.uniform(low=-0.05, high=0.05, size=embedding_dim)
                    continue
                # The previous `embedding.loc[word].as_matrix()` never produced a
                # vector: `as_matrix` was removed in pandas 1.0, and a frame with
                # words as columns raises KeyError on `.loc[word]`, so every word
                # was silently skipped.
                embedding_vector = _lookup_embedding_vector(embedding, word)
                if embedding_vector is None or embedding_vector.shape[0] < embedding_dim:
                    # word without a usable pretrained vector keeps its initial row
                    continue
                # NOTE(review): vectors longer than embedding_dim are cut to their
                # first embedding_dim components so they fit the matrix — confirm
                # this is the intended dimensionality reduction.
                embedding_lookup[i] = embedding_vector[:embedding_dim]
        print("created embedding lookup!")

        # save embedding matrix
        np.save(lookup_path, embedding_lookup)
        print("embedding matrix saved!")
        # save vocab
        with open(vocab_path, 'wb') as f:
            pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)
        print("vocab saved!")

        print("Embedding lookup table shape for " + embedding_lookup_name + " is: " + str(embedding_lookup.shape))
    else:
        print("found npy and pkl files!")
        # pickle executes code on load: only point this at vocab files you created yourself
        with open(vocab_path, "rb") as f:
            vocab = pickle.load(f)

    print("Vocab size for " + embedding_vocab_name + " is: " + str(len(vocab)))

    return vocab

In [None]:
def text_to_sequences_fixed_size(texts, vocab, MAX_SENT_LENGTH, save_full_text=False, take_full_claim = False):
    """
    Turn texts into fixed-length rows of vocabulary indices.

    Each text is lower-cased and tokenized with nltk.word_tokenize; the first
    MAX_SENT_LENGTH tokens are replaced by their index in ``vocab``. Tokens
    missing from the vocab get the index of the "UNKNOWN" entry when the vocab
    has one, and are left as 0 otherwise. Unused positions stay 0.

    :param texts: list of text strings
    :param vocab: dict mapping token -> integer index
    :param MAX_SENT_LENGTH: number of columns of the returned array
    :param save_full_text: if True, also return a dict mapping row number to the
        space-joined tokens of that row's text
    :param take_full_claim: if False, only the sentence with the most tokens of
        each text is used; if True the whole text is used
    :return: int32 array of shape (len(texts), MAX_SENT_LENGTH), or the tuple
        (array, dict) when save_full_text is True
    """
    data = np.zeros((len(texts), MAX_SENT_LENGTH), dtype='int32')

    if take_full_claim:
        claims = texts
    else:
        claims = []
        for claim in texts:
            claim_sents = nltk.sent_tokenize(claim)
            # Keep the longest sentence. `default` covers texts with no sentence
            # at all (e.g. an empty string), which used to crash max().
            claims.append(max(claim_sents,
                              key=lambda sentence: len(nltk.word_tokenize(sentence)),
                              default=claim))

    # index used for out-of-vocabulary tokens; None when the vocab has no UNKNOWN entry
    unknown_index = vocab.get("UNKNOWN")

    data_string_dict = {}
    for j, claim in tqdm(enumerate(claims), total=len(claims)):
        claim_tokens = nltk.word_tokenize(claim.lower())

        if save_full_text:
            data_string_dict[j] = " ".join(claim_tokens)

        # tokens beyond the fixed length are dropped
        for i, token in enumerate(claim_tokens[:MAX_SENT_LENGTH]):
            index = vocab.get(token, unknown_index)
            if index is not None:
                data[j, i] = index

    if save_full_text:
        return data, data_string_dict
    return data



In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def single_flat_LSTM_50d_100(headlines, bodies, GloVe_vectors):
    """
    Build the index sequences for every "headline. body" pair.

    Steps, in order: create or load the vocabulary / embedding lookup from all
    headlines and bodies, concatenate each headline with its body, convert the
    concatenated texts to fixed-length index sequences, and pickle the
    parameter dictionary into FEATURES_DIR.

    :param headlines: list of headline (claim) strings
    :param bodies: list of body strings, aligned with ``headlines``
    :param GloVe_vectors: pretrained word vectors handed to
        create_embedding_lookup_pandas
    :return: whatever text_to_sequences_fixed_size returns for the
        concatenated texts
    """
    method_name = "single_flat_LSTM_50d_100"

    # Settings of this feature extractor; they are pickled at the end of the function.
    param_dict = {
        "MAX_NB_WORDS": 50000,                            # size of the vocabulary
        "MAX_SEQ_LENGTH": 100,                            # tokens kept per sequence
        "EMBEDDING_DIM": 50,                              # width of the embedding matrix
        "EMBEDDING_FILE": method_name+"_embedding.npy",   # embedding matrix file name
        "VOCAB_FILE": method_name+"_vocab.pkl",           # vocabulary file name
    }
    param_dict_path = FEATURES_DIR + method_name + "_param_dict.pkl"

    # --- vocabulary and embedding matrix over headlines followed by bodies ---
    corpus = headlines.copy()
    corpus.extend(bodies.copy())

    vocab = create_embedding_lookup_pandas(
        corpus,
        param_dict["MAX_NB_WORDS"],
        param_dict["EMBEDDING_DIM"],
        GloVe_vectors,
        param_dict["EMBEDDING_FILE"],
        param_dict["VOCAB_FILE"],
        init_zeros=False,
        add_unknown=True,
        rdm_emb_init=True,
        tokenizer=nltk.word_tokenize,
    )

    # drop the local reference to the vectors; they are not needed past this point
    del GloVe_vectors

    # --- index sequences for the concatenated texts ---
    print("Create sequences and embedding for the heads")

    concatenated = [headlines[k] + ". " + bodies[k] for k in range(len(headlines))]

    # each token is replaced by its vocabulary id, i.e. its row in the embedding matrix
    sequences = text_to_sequences_fixed_size(
        concatenated,
        vocab,
        param_dict["MAX_SEQ_LENGTH"],
        save_full_text=False,
        take_full_claim=True,
    )

    # --- persist the parameters next to the other feature files ---
    with open(param_dict_path, 'wb') as f:
        pickle.dump(param_dict, f, pickle.HIGHEST_PROTOCOL)
    print("Save PARAM_DICT as " + param_dict_path)

    return sequences

# Generate Word Embedding Matrix

In [None]:
%%time

# Parse the FastText text file into a {word: vector} dict.
# The whole file is read line by line, so this takes a while.
GloVe_vectors = load_embedding_pandas(fasttext_path)

# Wrap the dict in a DataFrame: each word becomes one column.
# Also slow for a large vocabulary.
g_vec = pd.DataFrame(GloVe_vectors)



Loaded 2000000 word vectors.


In [None]:
# Display one sample word vector: all rows of the column at position 200 of g_vec.
g_vec.iloc[:,200]

0      0.0471
1      0.0085
2      0.0203
3      0.0081
4      0.0089
        ...  
295   -0.0165
296   -0.0096
297    0.0135
298    0.0094
299   -0.0394
Name: اسلامی, Length: 300, dtype: float32

In [None]:
# Download NLTK's 'punkt' tokenizer data.
# NOTE(review): presumably required by the nltk.sent_tokenize / nltk.word_tokenize
# calls used by the functions in this notebook — confirm for the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
%%time

# Build the token-id sequences for every claim/body pair of the cleaned dataset.
# single_flat_LSTM_50d_100 also creates (or reuses) the vocabulary and the
# embedding lookup matrix and stores them, together with its parameter dict,
# in FEATURES_DIR. Configured sizes: embedding width 50, sequence length 100.
claim_texts = clean_claim.to_list()
body_texts = clean_body.to_list()
sequences = single_flat_LSTM_50d_100(claim_texts, body_texts, g_vec)

print(sequences.shape)

can't find npy or pkl file!




created embedding lookup!
embedding matrix saved!
vocab saved!
Embedding lookup table shape for single_flat_LSTM_50d_100_embedding.npy is: (48758, 50)
Vocab size for single_flat_LSTM_50d_100_vocab.pkl is: 48757
Create sequences and embedding for the heads


1997it [00:03, 516.65it/s]


Save PARAM_DICT as /content/drive/MyDrive/Stance Detection Project/Baseline Zarharan/features/single_flat_LSTM_50d_100_param_dict.pkl
(1997, 100)
CPU times: user 7.41 s, sys: 34.6 ms, total: 7.45 s
Wall time: 7.7 s


# Save Sequences

In [None]:
# Save the array of token-id sequences to `sequences_file` in NumPy .npy format
# so it can be loaded again without recomputing it.
np.save(sequences_file, sequences)