In [2]:
# from utils import compute_similarity
import time
# from preprocess import remove_special_chars, rm_stopwords_stem_lowfreq, fetch_low_freq_words
# from tfidf_feature_extraction import calc_freq_distr, calc_idf, calc_tf, calc_tf_idf, compute_para_similarity, compute_similarity_centroid
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import sys

EMPTY_INP_SIMILARITY = 'Invalid inputs to the function compute_similarity.'



In [3]:

INVALID_INP_TFIDF = 'Error in TF-IDF Calculation. Invalid Input arrays.'
INVALID_INP_SIMILARITY = 'Error while computing compute paragraph similarity. Invalid inputs.'
EMPTY_INP_FREQ = 'Error while calculating frequency distribution of words. Invalid inputs.'
INVALID_INP_IDF = 'Error while calculating IDF. Invalid inputs.'
INVALID_INP_TF = 'Error while calculating the Term Frequency. Invalid inputs.'
INVALID_INP_CENTRE = 'Error while computing centroid. Invalid inputs.'

"""
Calculate the frequency distribution of each word in a paragraph.
Outputs an array of the frequency distribution for each paragraph.
"""
def calc_freq_distr(input_paragraphs, word_corpus):
    
    if input_paragraphs.size == 0 or word_corpus.size == 0:
        raise ValueError(EMPTY_INP_FREQ)

    try:
        freq_distribution = np.zeros((len(input_paragraphs), len(word_corpus)))
        
        for p in range(len(input_paragraphs)):
            paragraph = input_paragraphs[p].split()
            for w in range(len(word_corpus)):
                freq_distribution[p,w] = paragraph.count(word_corpus[w])
        return freq_distribution

    except Exception as e:
        print('Error: ', e)
        print('Error while calculating frequency distribution of words.')


"""
calc_idf - Calculate and returns the Inverse Document Frequency score. This function takes frequency_distribution dictionary as input.

IDF definition:
    IDF(t) = log(N+1/df(t)+1) + 1
    where t is each word in the word corpus (feature)
    N is the number of paragraphs in the document
    df(t) is the count of documents in which the word appears. 
    
    An extra term 1 has been added to numerator and denominator to avoid divide by zero error. 
    It is equivalent to adding an extra paragraph which contains every word exactly once.
"""
def calc_idf(freq_distribution):
    
    if freq_distribution.size == 0:
        raise ValueError(INVALID_INP_IDF)

    try:
        n_paragraphs, n_words = freq_distribution.shape
        word_distribution = np.array([np.count_nonzero(freq_distribution[:,w]) for w in range(n_words)]).reshape(n_words, 1)
        doc_count = np.zeros((word_distribution.shape))  + float(n_paragraphs)
        idf = np.log(np.divide(1+doc_count,1+word_distribution)).transpose() + 1
        return idf

    except Exception as e:
        print('Error: ', e)
        print('Error while computing IDF.')


"""
calc_tf - Calculate and returns the Term Frequency score. This function takes frequency_distribution array as input.
TF formula:
    TF(t) = Count of each word in the paragraph / Total number of words in the paragraph
"""
def calc_tf(freq_distribution):
    if freq_distribution.size == 0:
        raise ValueError(INVALID_INP_TF)
    try:    
        word_count = np.repeat(np.sum(freq_distribution, axis = 1).reshape(freq_distribution.shape[0], 1), repeats = freq_distribution.shape[1], axis = 1)
        tf = np.divide(freq_distribution, word_count)
        return tf

    except Exception as e:
        print('Error: ', e)
        print('Error while calculating Term Frequency.')


"""
calc_tf_idf - Calculate TF-TDF of each word in the document.
Inputs:
1. Array of Term frequency (TF) of each paragraph in the document
2. IDF array of document.
Output: 
Returns a TF-IDF array.
"""
def calc_tf_idf(tf, idf):
    
    if tf.size == 0 or idf.size == 0:
        raise ValueError(INVALID_INP_TFIDF)

    try:
        tf_idf = np.multiply(tf, idf)
        norm = np.linalg.norm(tf_idf, axis = 1).reshape(tf.shape[0], 1)
        return tf_idf/norm

    except Exception as e:
        print('Error: ', e)
        print('Error while calculating TF-IDF.')


"""
compute_para_similarity - Calculates the similarity of each paragraph.
Multiply the TF-IDF vectors of each paragraph with the centroid of the TF-IDF 
vectors of the paragraph with which you wish to compute the similarity.
Input:  array of TF-IDF values of paragraph. 
Output: Similar paragraphs
"""
def compute_para_similarity(tf_idf_array, similar_text, similarity_centroid):
    
    if tf_idf_array.size == 0 or similarity_centroid.size == 0 or len(similar_text) == 0:
        raise ValueError(INVALID_INP_SIMILARITY)
    try:
        similar_para_idx = []
        for i in range(1, tf_idf_array.shape[0]):
            if i not in similar_text and np.matmul(similarity_centroid, tf_idf_array[i, :]) > 0.35:  
                similar_para_idx.append(i)
        return similar_para_idx

    except Exception as e:
        print('Error: ', e)
        print('Error while computing paragraph similarity.')


def compute_similarity_centroid(tf_idf_array, similar_text):
    if tf_idf_array.size == 0 or len(similar_text) == 0:
        raise ValueError(INVALID_INP_CENTRE)
    try:
        similarity_centroid = np.sum(tf_idf_array[similar_text], axis = 0)/len(similar_text)
        return similarity_centroid
    except Exception as e:
        print('Error: ', e)
        print('Error while computing centroid.')


In [22]:

INVALID_STRING = 'Error while pre-processing the document. Invalid input to the function remove_special_chars.'
INVALID_LOW_FREQ = 'Error while pre-processing the document. Passed document does not contain any text.'
INVALID_PREPROCESS = 'Error while pre-processing the document. Error in the function rm_stopwords_stem_lowfreq'

"""
    remove_special_chars - to remove special characters from a string.    
"""
def remove_special_chars(paragraphs):

    if len(paragraphs) == 0 or paragraphs is None:
        raise ValueError(INVALID_STRING)    
    if(True):
        clean_string = re.sub(r"[^a-zA-Z0-9]+", ' ', paragraphs)
        clean_string = re.sub(r'\s+', ' ', clean_string)
        return clean_string.strip()


"""
    get_paragraphs - this function will split the data into sentences
"""
def fetch_low_freq_words(paragraphs):

    if paragraphs.size == 0:
        raise ValueError(INVALID_LOW_FREQ)    

    if(True):
        paragraphs = np.hstack(np.char.split(paragraphs))
        unique, count = np.unique(paragraphs, return_counts=True)
        word_count = np.asarray((unique, count)).T
        low_freq_words = word_count[np.where(word_count[:,1].astype(int) < 2),0]
        return low_freq_words



"""
    rm_stopwords_stem_lowfreq - to remove the stop words, low frequency words and perform stemming.
"""
def rm_stopwords_stem_lowfreq(paragraphs, low_freq_words):

    if paragraphs.size == 0:
        raise ValueError(INVALID_PREPROCESS)

    if(True):
        ps = PorterStemmer()
#         stop_words = set(stopwords.words('english'))
        tokenized_paragraphs = np.char.split(paragraphs)

        for p in range(tokenized_paragraphs.shape[0]):
            paragraphs[p] = ' '.join([ps.stem(word) for word in tokenized_paragraphs[p] if
                            len(word) > 2 and word not in low_freq_words])
        return paragraphs
    


In [23]:
"""
    This function is a wrapper which calls the function to pre-process the data.
"""
def preprocess_data(paragraph_list):
    
    
    time_in = time.time()
    paragraphs = np.array(paragraph_list)

    low_freq_words = fetch_low_freq_words(paragraphs)

    for p in range(paragraphs.shape[0]):
        if len(paragraphs[p]) == 0 or paragraphs[p] is None:
            continue;
        paragraphs[p] = remove_special_chars(paragraphs[p])
    
    clean_text = rm_stopwords_stem_lowfreq(paragraphs, low_freq_words)
#     print(clean_text)
    print('Time taken to pre-process the data: ', time.time() - time_in)
    return clean_text
    

In [24]:
"""
This function returns the index of the similar paragraphs.
"""
def compute_similarity(paragraph_list, similar_text_idx):
    if len(paragraph_list) == 0 or len(similar_text_idx) == 0:
        raise ValueError(EMPTY_INP_SIMILARITY)
    
    if(True):
        clean_text = preprocess_data(paragraph_list)
        print("Clean Text : ", clean_text)
        word_corpus = np.hstack(np.char.split(clean_text))
    
        # Calculate Word Frequency in the document
        freq_distribution = calc_freq_distr(clean_text, word_corpus)

        # Calculate Inverse Document Frequency
        word_idf = calc_idf(freq_distribution)
        
        # Calculate TF term
        word_tf = calc_tf(freq_distribution)

        # Calculate TF-IDF 
        word_tf_idf = calc_tf_idf(word_tf, word_idf)

        if len(similar_text_idx) > 1:
            similarity_centroid = compute_similarity_centroid(word_tf_idf, similar_text_idx)
        else:
            similarity_centroid = word_tf_idf[similar_text_idx[0]] 

        # Check Similarity 
        similar_para_idx = compute_para_similarity(word_tf_idf, similar_text_idx, similarity_centroid)
        
        return similar_para_idx
    


In [41]:
import json
with open('ak.txt', 'rb') as content_file:
    content = content_file.read().decode(errors='replace')
    print(len(content.split("\n")))
ids = compute_similarity(content.split("\n"), [0, 1, 3, 4, 8, 19, 24])

data = {}

# data["index"] = [0]
# data["paras"] = content.split("\n")
# with open('data.json', 'w') as outfile:  
#     json.dump(data, outfile)
print(ids)
# for i in ids:
#     print(i)

25
Time taken to pre-process the data:  0.032076358795166016
Clean Text :  ['the natur languag process gener the 1950 work can found from period 1950 ture intellig which now the ture intellig'
 'the translat more than into english the that year machin translat problem howev wa much slower and the 1966 which found that ten year long research had the expect for machin translat wa reduc research machin translat wa the late when the statist machin translat system were develop'
 'some natur languag process system develop the were shrdlu natur languag system restrict block world with restrict vocabulari and eliza psychotherapist written and 1966 inform human emot eliza human like interact the patient the veri base eliza respons for head hurt with whi head hurt'
 'dure the 1970 mani conceptu ontolog which world inform into understand data are schank 1975 cullingford 1978 wilenski 1978 meehan 1976 lehnert 1977 carbonel 1979 and lehnert 1981 dure thi mani were written parri racter and jabberwac

