In [1]:
import re
from stemming.porter2 import stem
import math

def tokenization(content):
	"""Break a text string into lowercase word tokens.

	Every run of the punctuation characters listed in the pattern below is
	replaced by a single space, the text is lowercased, and the result is
	split on whitespace.

	Args:
		content: the text to tokenize.

	Returns:
		A list of lowercase token strings (empty if nothing remains).
	"""
	punctuation_run = r'[!@#$%^&*()_+{}|:"<>?,./;\'[\]\-=]+'
	spaced = re.sub(punctuation_run, ' ', content)
	lowered = spaced.lower()
	return lowered.split()

def stop_words(word_tokens, stop_word_read):
	"""Remove stop words from a token list.

	The stop-word text is tokenized with `tokenization`, so the stop words
	are normalised the same way as the document tokens before comparison.

	Args:
		word_tokens: iterable of token strings to filter.
		stop_word_read: raw text containing the stop words.

	Returns:
		A list of the tokens from `word_tokens`, in their original order,
		that are not stop words.
	"""
	# A frozenset gives constant-time membership tests; the original list made
	# every lookup a linear scan over the whole stop-word list.
	stop_word_tokens = frozenset(tokenization(stop_word_read))
	return [w for w in word_tokens if w not in stop_word_tokens]

def porter_stemmer(filter_stop):
	"""Apply `stem` (imported from stemming.porter2) to every token.

	Args:
		filter_stop: iterable of token strings.

	Returns:
		A list with the stemmed form of each token, in the same order.
	"""
	return list(map(stem, filter_stop))

def pre_processing(content, stop_word_read):
	"""Run the full text pipeline: tokenize, drop stop words, then stem.

	Args:
		content: raw text of one document.
		stop_word_read: raw text containing the stop words.

	Returns:
		The list of stemmed tokens that survive stop-word removal.
	"""
	return porter_stemmer(stop_words(tokenization(content), stop_word_read))

In [3]:
def _mi_term(cell_count, row_total, col_total, N):
	"""Return one contingency-table cell's contribution to mutual information.

	The contribution is defined as 0 when the cell or either of its marginal
	totals is 0, which avoids log(0) and division by zero.
	"""
	if row_total == 0 or col_total == 0 or cell_count == 0:
		return 0
	return cell_count/N*math.log((N*cell_count)/(row_total*col_total),2)


def MI_chi(N,tokens_set,chosen_corpus,other_corpuses,top_k=10):
	"""Print the highest-scoring tokens by mutual information and by chi-squared.

	For every token a 2x2 table of document counts is built
	(token present / absent  x  document in `chosen_corpus` / in
	`other_corpuses`), and both scores are computed from it. The `top_k`
	best tokens by mutual information are printed first, followed by the
	`top_k` best by chi-squared, one "token,score" line each with the score
	formatted to three decimals.

	Args:
		N: total number of documents, used in the mutual-information formula.
		tokens_set: iterable of tokens to score.
		chosen_corpus: documents of the target class; each document must be
			an iterable of hashable tokens (e.g. a list of strings).
		other_corpuses: documents of all remaining classes, same format.
		top_k: maximum number of tokens printed per ranking (default 10).
			Fewer lines are printed when fewer tokens are available.

	Returns:
		None. The rankings are written to standard output.
	"""
	# Convert each document to a set once so that every membership test below
	# is constant time instead of a scan over the document's token list.
	chosen_docs = [frozenset(single_doc) for single_doc in chosen_corpus]
	other_docs = [frozenset(single_doc) for single_doc in other_corpuses]

	# Column totals of the contingency table do not depend on the token.
	N_both_1 = len(chosen_docs)
	N_both_0 = len(other_docs)

	mi_scores = {}
	chi_scores = {}

	for single_token in tokens_set:
		# First index: token present (1) / absent (0).
		# Second index: document in the chosen corpus (1) / elsewhere (0).
		N_1_1 = sum(1 for single_doc in chosen_docs if single_token in single_doc)
		N_1_0 = sum(1 for single_doc in other_docs if single_token in single_doc)
		N_0_1 = N_both_1 - N_1_1
		N_0_0 = N_both_0 - N_1_0

		N_1_both = N_1_1 + N_1_0
		N_0_both = N_0_1 + N_0_0

		mi_scores[single_token] = (
			_mi_term(N_1_1, N_1_both, N_both_1, N)
			+ _mi_term(N_0_1, N_0_both, N_both_1, N)
			+ _mi_term(N_1_0, N_1_both, N_both_0, N)
			+ _mi_term(N_0_0, N_0_both, N_both_0, N)
		)

		if (N_both_1==0 or N_1_both==0 or N_both_0==0 or N_0_both==0):
			# A zero marginal would make the denominator zero.
			chi_squared = 0
		else:
			# The leading factor is the number of documents actually counted,
			# not the N argument.
			chi_squared = (N_1_1 + N_1_0 + N_0_1 + N_0_0)*math.pow(N_1_1*N_0_0-N_1_0*N_0_1,2)/ \
				(N_both_1*N_1_both*N_both_0*N_0_both)

		chi_scores[single_token] = chi_squared

	# Slicing (rather than indexing 0..9) keeps small vocabularies from
	# raising IndexError.
	top_mi = sorted(mi_scores.items(), key = lambda item: item[1], reverse = True)[:top_k]
	top_chi = sorted(chi_scores.items(), key = lambda item: item[1], reverse = True)[:top_k]

	for token, score in top_mi:
		print("%s,%.3f" %(token, score))

	for token, score in top_chi:
		print("%s,%.3f" %(token, score))


In [6]:
# One list of pre-processed documents per corpus; each document is the token
# list returned by pre_processing.
Quran_list = []
OT_list = []
NT_list = []

# Every token of every line, duplicates included; reduced to a vocabulary below.
tokens_list = []

# Maps the corpus label found in the first TSV column to its document list.
corpus_buckets = {'Quran': Quran_list, 'OT': OT_list, 'NT': NT_list}

# Read the raw stop-word text; pre_processing tokenizes it itself.
with open('englishST.txt', 'r') as f:
	stop_word_read = f.read()

with open('train_and_dev.tsv','r') as f:
	for line in f:
		fields = line.split('\t')
		corpus = fields[0]
		content = pre_processing(fields[1], stop_word_read)
		tokens_list.extend(content)

		# Lines with an unrecognised label still contribute tokens to the
		# vocabulary but are not stored in any corpus.
		bucket = corpus_buckets.get(corpus)
		if bucket is not None:
			bucket.append(content)

# Each stored document was counted exactly once, so the list lengths are the counts.
num_Quran = len(Quran_list)
num_OT = len(OT_list)
num_NT = len(NT_list)

N = num_Quran + num_OT + num_NT

tokens_set = set(tokens_list) #each token only once

# Score every corpus against the other two, with a separator line between reports.
reports = [
	("Quran", Quran_list, OT_list+NT_list),
	("OT", OT_list, Quran_list+NT_list),
	("NT", NT_list, Quran_list+OT_list),
]
for report_index, (report_label, report_chosen, report_others) in enumerate(reports):
	if report_index > 0:
		print("----------------------------")
	print(report_label)
	MI_chi(N,tokens_set,report_chosen,report_others)

Quran
allah,0.153
thou,0.039
thi,0.031
ye,0.028
thee,0.028
god,0.025
man,0.020
king,0.019
hath,0.019
punish,0.018
allah,7058.784
punish,917.837
thou,889.245
believ,856.012
unbeliev,811.822
messeng,769.741
god,704.642
thi,699.436
beli,683.328
guid,677.282
----------------------------
OT
allah,0.087
jesus,0.041
israel,0.036
lord,0.031
thi,0.030
king,0.029
thou,0.023
christ,0.021
thee,0.019
believ,0.017
allah,2778.575
jesus,1296.973
lord,1119.329
israel,1070.163
thi,953.891
king,884.374
thou,776.969
christ,649.054
thee,633.997
believ,600.444
----------------------------
NT
jesus,0.065
christ,0.037
allah,0.019
discipl,0.018
lord,0.016
ye,0.013
israel,0.013
faith,0.013
paul,0.012
peter,0.011
jesus,3268.989
christ,1795.001
discipl,909.800
faith,669.145
paul,588.945
ye,586.429
peter,560.751
lord,538.896
thing,525.050
receiv,490.809


In [7]:
# Re-run the report for the OT corpus against the Quran and NT documents.
print("----------------------------", "OT", sep="\n")
MI_chi(N, tokens_set, chosen_corpus=OT_list, other_corpuses=Quran_list+NT_list)

----------------------------
OT
allah,0.087
jesus,0.041
israel,0.036
lord,0.031
thi,0.030
king,0.029
thou,0.023
christ,0.021
thee,0.019
believ,0.017
allah,2778.575
jesus,1296.973
lord,1119.329
israel,1070.163
thi,953.891
king,884.374
thou,776.969
christ,649.054
thee,633.997
believ,600.444


In [8]:
# Re-run the report for the NT corpus against the Quran and OT documents.
print("----------------------------", "NT", sep="\n")
MI_chi(N, tokens_set, chosen_corpus=NT_list, other_corpuses=Quran_list+OT_list)

----------------------------
NT
jesus,0.065
christ,0.037
allah,0.019
discipl,0.018
lord,0.016
ye,0.013
israel,0.013
faith,0.013
paul,0.012
peter,0.011
jesus,3268.989
christ,1795.001
discipl,909.800
faith,669.145
paul,588.945
ye,586.429
peter,560.751
lord,538.896
thing,525.050
receiv,490.809
