In [None]:
# Required dependencies. Run this cell once to install the external packages needed to run the code.
# PyICU, pycld2 and Morfessor are installed before polyglot is used: the `polyglot download`
# command presumably needs them at import time (TODO confirm), so the model downloads come last.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install PyICU
%pip install pycld2
%pip install Morfessor
%pip install polyglot
!polyglot download sgns2.bn
!polyglot download sentiment2.bn

Please refer to the master Google Drive folder mentioned in the README file. You will need to download all of those data files to your system.

In [None]:
#Importing all the packages needed
import pandas as pd
import polyglot
from polyglot.text import Text, Word, Downloader
import numpy as np
from nltk import ngrams, bigrams

In [None]:
# Read the Bengali headline/body pairs and pull out the text columns and integer labels.
bengali_dataset = pd.read_csv('./Data/BengaliSentences.csv')
headline, body = bengali_dataset['headline'], bengali_dataset['content']
labels = list(map(int, bengali_dataset['label']))

# There are 7202 headline-body pairs in the dataset; labels are 1 for authentic news and 0 for fake news.

# Creating Neural Embeddings for the sentences

In [None]:

# Width used when reshaping each polyglot word vector below.
EMBEDDING_DIM = 256


def _mean_word_vector(text):
  """Average the polyglot Bengali word vectors of a whitespace-tokenised text.

  Words whose vector lookup fails contribute nothing to the sum, but they still
  count in the denominator (the sum is divided by the total number of tokens).

  Returns a float32 array of shape (1, EMBEDDING_DIM); all zeros when `text`
  has no tokens (instead of the NaNs a 0/0 division would produce).
  """
  tokens = text.split()
  total = np.zeros((1, EMBEDDING_DIM), dtype='float32')
  for token in tokens:
    word = Word(token, language='bn')
    try:
      total += word.vector.reshape((1, EMBEDDING_DIM))
    except Exception:
      # Best effort: skip words without a usable vector. `Exception` (not a bare
      # `except`) keeps Ctrl-C / kernel interrupt working during this long loop.
      continue
  # max(..., 1) guards against division by zero for empty texts.
  return total / max(len(tokens), 1)


# One (1, 2 * EMBEDDING_DIM) row per pair: headline average followed by body average.
neural_vec = [
  np.concatenate((_mean_word_vector(h), _mean_word_vector(b)), axis=1)
  for h, b in zip(headline, body)
]

In [None]:
# Saving the neural embeddings (the list of per-pair arrays is stacked into one array first).
# The original call was missing its closing parenthesis, which is a SyntaxError.
np.save(arr=np.array(neural_vec), file='./Data/bengali_neural.npy')

# Creating Statistical Embeddings for the sentences

In [None]:
# Create the list of unique headline words (whitespace-separated tokens).
# The vocabulary is kept as a *sorted list* rather than a set: later cells use its
# iteration order as the column order of the tf / idf matrices, and the iteration
# order of a set of strings can change between kernel sessions.
headline_corpus_words = sorted({word for head in headline for word in head.split()})
print(len(headline_corpus_words))

In [None]:
# computing term frequency for Headline
# tf_matrix_head[i, j] = (occurrences of vocabulary word j in headline i) / (tokens in headline i)
headline_vocab = list(headline_corpus_words)  # freeze the iteration order used for the columns
headline_col = {word: j for j, word in enumerate(headline_vocab)}

tf_matrix_head = np.zeros((len(headline), len(headline_vocab)))
for i, head in enumerate(headline):
  tokens = head.split()  # split once per headline instead of once per vocabulary word
  counts = {}
  for token in tokens:
    counts[token] = counts.get(token, 0) + 1
  for token, n_count in counts.items():
    j = headline_col.get(token)
    if j is None:
      # Token absent from the vocabulary: it has no column, so it is ignored.
      continue
    tf_matrix_head[i, j] = n_count * 1.0 / len(tokens)

print(tf_matrix_head.shape)  # (number of headlines, headline vocabulary size)

In [None]:
# computing idf terms for each headline word
# idf(word) = log(number of headlines / number of headlines that contain the word)
n_headlines = len(headline)

# Document frequency in a single pass over the corpus; set() makes each headline
# count at most once per word.
headline_doc_freq = {}
for head in headline:
  for token in set(head.split()):
    headline_doc_freq[token] = headline_doc_freq.get(token, 0) + 1

# Same order as iterating `headline_corpus_words`, so entry j belongs to vocabulary word j.
idf_headline_corpus_words = [
  np.log(float(n_headlines) / headline_doc_freq[word])
  for word in headline_corpus_words
]

# Print the number of idf values (one per vocabulary word), not the whole list.
print(len(idf_headline_corpus_words))

In [None]:
# computing IDF terms for headline
# idf_matrix_head[i, j] is the idf of vocabulary word j if that word occurs in headline i, else 0.
headline_vocab = list(headline_corpus_words)  # same iteration order the idf list was built in
headline_col = {word: j for j, word in enumerate(headline_vocab)}

idf_matrix_head = np.zeros((len(headline), len(headline_vocab)))
for i, head in enumerate(headline):
  for token in head.split():
    j = headline_col.get(token)
    if j is None:
      # A token missing from the vocabulary must not touch any column
      # (an index of -1 would silently overwrite the last one).
      continue
    idf_matrix_head[i, j] = idf_headline_corpus_words[j]

print(idf_matrix_head.shape)  # (number of headlines, headline vocabulary size)


In [None]:
# Element-wise product of the tf and idf matrices gives the headline tf-idf matrix.
tfidf_matrix_headline = np.multiply(
  tf_matrix_head,
  idf_matrix_head,
)
headline_tfidf_shape = tfidf_matrix_headline.shape
print(headline_tfidf_shape)  # should be (7202,12292)

In [None]:
# Computing the list of unique body words (whitespace-separated tokens).
# Kept as a *sorted list* rather than a set: later cells use its iteration order as the
# column order of the tf / idf matrices, and the iteration order of a set of strings
# can change between kernel sessions.
body_corpus_words = sorted({word for b in body for word in b.split()})
print(len(body_corpus_words))  # should be 117023

In [None]:
# computing term frequency for body
# tf_matrix_body[i, j] = (occurrences of vocabulary word j in body i) / (tokens in body i)
# The result is stored directly in `tf_matrix_body`; previously the rows were collected
# in `tf_matrix` and the undefined name `tf_matrix_body` was converted (NameError).
body_vocab = list(body_corpus_words)  # freeze the iteration order used for the columns
body_col = {word: j for j, word in enumerate(body_vocab)}

tf_matrix_body = np.zeros((len(body), len(body_vocab)))
for i, b in enumerate(body):
  tokens = b.split()  # split once per body instead of once per vocabulary word
  counts = {}
  for token in tokens:
    counts[token] = counts.get(token, 0) + 1
  for token, n_count in counts.items():
    j = body_col.get(token)
    if j is None:
      # Token absent from the vocabulary: it has no column, so it is ignored.
      continue
    tf_matrix_body[i, j] = n_count * 1.0 / len(tokens)

print(tf_matrix_body.shape)  # (number of bodies, body vocabulary size)

In [None]:
# computing IDF terms for each body word
# idf(word) = log(number of bodies / number of bodies that contain the word)
n_bodies = len(body)

# Document frequency in a single pass over the corpus; set() makes each body
# count at most once per word.
body_doc_freq = {}
for b in body:
  for token in set(b.split()):
    body_doc_freq[token] = body_doc_freq.get(token, 0) + 1

# Same order as iterating `body_corpus_words`, so entry j belongs to vocabulary word j.
idf_body_corpus_words = [
  np.log(float(n_bodies) / body_doc_freq[word])
  for word in body_corpus_words
]

print(len(idf_body_corpus_words))  # one idf value per vocabulary word

In [None]:
# computing idf term for body sentences
# idf_matrix_body[i, j] is the idf of vocabulary word j if that word occurs in body i, else 0.
body_vocab = list(body_corpus_words)  # same iteration order the idf list was built in
body_col = {word: j for j, word in enumerate(body_vocab)}

idf_matrix_body = np.zeros((len(body), len(body_vocab)))
for i, b in enumerate(body):
  for token in b.split():
    j = body_col.get(token)
    if j is None:
      # A token missing from the vocabulary must not touch any column
      # (an index of -1 would silently overwrite the last one).
      continue
    idf_matrix_body[i, j] = idf_body_corpus_words[j]

print(idf_matrix_body.shape)  # (number of bodies, body vocabulary size)

In [None]:
# Multiplying the tf and idf terms for body (element-wise) to form the final tf-idf matrix.
# Uses the body matrices built in the cells above; the names `tf_matrix` / `idf_matrix`
# used previously do not refer to those arrays (`idf_matrix` is never defined).
tfidf_matrix_body = np.multiply(tf_matrix_body, idf_matrix_body)
print(tfidf_matrix_body.shape) # should be (7202,117023)

In [None]:
# Place the headline tf-idf columns and the body tf-idf columns side by side (axis 1)
# to obtain the final statistical feature matrix.
statistical_parts = [tfidf_matrix_headline, tfidf_matrix_body]
stat_bn = np.concatenate(statistical_parts, axis=1)
print(stat_bn.shape) # should be (7202,129315)

In [None]:
# Persist the combined Bengali statistical feature matrix to disk.
np.save('./Data/bengali_statistical.npy', stat_bn)

# Creating External Features for Bengali Sentences

In [None]:
# Creating the complete external features array. For every headline-body pair the vector holds:
#   * 15 counts of distinct character n-grams shared by headline and body (n = 2..16)
#   * 5 counts of distinct word n-grams shared by headline and body (n = 2..6)
#   * 1 relative polarity of the headline with respect to the body
#     (0 when a word polarity lookup fails)


def _shared_ngram_count(seq1, seq2, n):
  """Return the number of distinct n-grams that occur in both sequences.

  Passing strings compares character n-grams; passing token lists compares word n-grams.
  """
  return len(set(ngrams(seq1, n)).intersection(ngrams(seq2, n)))


def _polarity_sum(sentence):
  """Sum the polyglot polarity of every word in `sentence`.

  Returns None as soon as one word's polarity lookup raises, so the caller can
  fall back to a neutral feature value.
  """
  total = 0
  for word in Text(sentence).words:
    try:
      total += word.polarity
    except Exception:
      # Best effort; `Exception` (not a bare `except`) keeps kernel interrupts working.
      return None
  return total


def _relative_polarity(sent1, sent2):
  """Return headline polarity minus body polarity, each divided by its whitespace token count.

  Returns 0 when the polarity of either text cannot be computed.
  """
  pol1 = _polarity_sum(sent1)
  if pol1 is None:
    return 0
  pol2 = _polarity_sum(sent2)
  if pol2 is None:
    return 0
  # max(..., 1) guards against ZeroDivisionError for texts without any token.
  pol1 = pol1 / (max(len(sent1.split()), 1) * 1.0)
  pol2 = pol2 / (max(len(sent2.split()), 1) * 1.0)
  return pol1 - pol2


bn_ext = []
# `headline` is the column loaded from the dataset (the name `headlines` is not defined).
for sent1, sent2 in zip(headline, body):
  # Character n-grams: nltk's ngrams over a string yields tuples of characters.
  vec = [_shared_ngram_count(sent1, sent2, n) for n in range(2, 17)]
  # Word n-grams over the whitespace tokens.
  words1, words2 = sent1.split(), sent2.split()
  vec.extend(_shared_ngram_count(words1, words2, n) for n in range(2, 7))
  vec.append(_relative_polarity(sent1, sent2))
  bn_ext.append(vec)

bn_ext = np.array(bn_ext)
print(bn_ext.shape)# should be (7202,21)

In [None]:
# Persist the Bengali external feature array to disk.
np.save('./Data/bengali_external.npy', bn_ext)