In [2]:
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from math import log
import csv
import string
import re
import pandas as pd
import numpy as np
from collections import Counter

In [3]:
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to C:\Users\Zain
[nltk_data]     Abbas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Zain
[nltk_data]     Abbas\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# Text Data Processing Toolkit

This toolkit provides functions to process text data extracted from research papers. It includes functions for tokenization, cleaning, and segmentation of text.

## Functions

### 1. `remove_stopwords(tokens)`

- **Purpose**: Removes stopwords and punctuation from a list of tokens.
- **Input**: 
  - `tokens`: List of tokens to be processed.
- **Output**:
  - `filtered_tokens`: List of tokens with stopwords and punctuation removed.

### 2. `extract_and_index_data(directory)`

- **Purpose**: Extracts and indexes data from text files in a directory.
- **Input**:
  - `directory`: Path to the directory containing text files.
- **Output**:
  - Categorized tokens:
    - `token1`: Tokens of length 3-14 characters.
    - `token_of_len_2`: Tokens of length 2 characters.
    - `token_as_sentence`: Tokens longer than 14 characters.
    - `token_have_hyphen`: Tokens containing hyphens.
    - `token_have_punctuation`: Tokens containing punctuation.
    - `all_token`: All other tokens.
    - `all_number`: Tokens consisting only of numbers.

### 3. `cleaning_pipeline(token)`

- **Purpose**: Further cleans tokenized data by splitting tokens with multiple words, removing non-alphabetic characters, and filtering tokens based on their length and presence in the English dictionary.
- **Input**:
  - `token`: List of tokens to be cleaned.
- **Output**:
  - `tokens_no_further_processing_required`: Cleaned tokens not requiring further processing.
  - `clean_token`: Cleaned tokens after initial processing.
  - `token_contain_only_numbers`: Tokens containing only numbers.

### 4. `token_seperator(token)`

- **Purpose**: Separates concatenated tokens using dynamic programming and further cleans the tokens.
- **Input**:
  - `token`: List of tokens to be separated.
- **Output**:
  - `tokens_no_further_processing_required`: Cleaned tokens not requiring further processing.
  - `clean_token`: Cleaned tokens after initial processing.
  - `token_contain_only_numbers`: Tokens containing only numbers.

### 5. `remove_conjunctions_from_sets(input_list)`

- **Purpose**: Removes common conjunctions from a list of tokens.
- **Input**:
  - `input_list`: List of tokens to be processed.
- **Output**:
  - `filtered_list`: List of tokens with common conjunctions removed.

### 6. `word_segment(sentences)`

- **Purpose**: Segments words from sentences, handling cases of concatenated words and filtering out stopwords and short words.
- **Input**:
  - `sentences`: List of sentences to be processed.
- **Output**:
  - Segmented words.


In [4]:
stop_words = set(["a", "is", "the", "of", "all", "and", "to", "can", "be", "as", "once", "for", "at", "am", "are", "has", "have", "had", "up", "his", "her", "in", "on", "no", "we", "do"])
punctuation = set(['!', '\\', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '—'])
english_words = set(nltk.corpus.words.words())
porter = PorterStemmer()


#remove_stopwords, simply remove the stopwords and the punctuatated tokens
def remove_stopwords(tokens):    
    punctuation = set(string.punctuation)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in punctuation]
    return filtered_tokens

def extract_and_index_data(directory):
    
    token1 = []
    token_of_len_2 = []
    token_as_sentence = []
    token_have_hyphen = []
    token_have_punctuation = []
    all_token = []
    all_number = []
    
    for filename in os.listdir(directory):
  
        if os.path.isfile(os.path.join(directory, filename)):
            with open(os.path.join(directory, filename), 'r') as text:
                data = text.read()

            file_tokens = word_tokenize(data)
            normalized = remove_stopwords(file_tokens)

            for token in normalized:
                if 3 <= len(token) <= 14 and all(char not in string.punctuation for char in token) and not token.isdigit():
                    token1.append((token.lower(), filename))
                elif len(token) == 2 and not token.isdigit():
                    token_of_len_2.append((token.lower(), filename))
                elif len(token) > 14 and '-' not in token and not token.isdigit():
                    token_as_sentence.append((token.lower(), filename))
                elif '-' in token:
                    token_have_hyphen.append((token.lower(), filename))
                elif any(char in string.punctuation for char in token) and '-' not in token and len(token) <= 14 and not token.isdigit():
                    token_have_punctuation.append((token.lower(), filename))
                elif token.isdigit():
                    all_number.append((token.lower(), filename))
                else:
                    all_token.append((token.lower(), filename))

    return (token1, token_of_len_2, token_as_sentence, token_have_hyphen, token_have_punctuation, all_token, all_number)


directory_path = r'D:\SEMESTER 06\INFORMATION RETRIEVAL\ASSIGNMENT II\ResearchPapers'
#function invoking
tokens= extract_and_index_data(directory_path)

token1, token_of_len_2, token_as_sentence, token_have_hyphen, token_have_punctuation, all_token, all_number = tokens

#########################   

def cleaning_pipeline(token):
    clean_token = []
    token_split = []
    new_token = []
    tokens_no_further_processing_required = []
    token_contain_only_numbers = []
    for x, y in token:
        if len(x) >= 4 and not x.isdigit() and any(c.isalpha() for c in x):
            new_x = ''.join(' ' if char in punctuation else char for char in x)
            if ' ' in new_x:
                token_split = new_x.split(' ')
                for word in token_split:
                    if len(word) >= 4:
                        new_token.append((word, y))
            else:
                if len(new_x) >= 4:
                    new_token.append((new_x, y))
        elif x.isdigit():
            token_contain_only_numbers.append((x, y))
    
    for x,y in new_token:
        if x.isdigit():
            token_contain_only_numbers.append((x,y))

    
    for x, y in new_token:
        if any(char.isdigit() for char in x):
            cleaned_x = ''.join(char if char != '—' else '' for char in x) and ''.join(char for char in x if not char.isdigit())
            split_tokens = nltk.word_tokenize(cleaned_x)
            for word in split_tokens:
                if len(word) >= 4:
                    clean_token.append((word, y))
        else:
            clean_token.append((x, y))
    tokens_no_further_processing_required = [(x,y) for x,y in clean_token if x in english_words]
    clean_token = [(x, y) for x, y in clean_token if x not in english_words]
    return tokens_no_further_processing_required,clean_token,token_contain_only_numbers

#########################################

def token_seperator(token):
    words = []
    for x in english_words:
        words.append(x)

    for x,y in token1:
        words.append(x)


    wordcost = {k: log((i + 1) * log(len(words))) for i, k in enumerate(words)}
    maxword = max(len(x) for x in words)
    # return infer_spaces(token,maxword,wordcost)
    clean_term = []
    for i in range(len(token)):
        clean_term.append((infer_spaces(token[i][0].lower(),maxword,wordcost),token[i][1]))
    return cleaning_pipeline(clean_term)
    

def infer_spaces(s,maxword,wordcost):
    def best_match(i):
        candidates = enumerate(reversed(cost[max(0, i - maxword):i]))
        return min((c + wordcost.get(s[i - k - 1:i], 9e999), k + 1) for k, c in candidates)
    
    cost = [0]
    for i in range(1, len(s) + 1):
        c, k = best_match(i)
        cost.append(c)
    
    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k
    
    return " ".join(reversed(out))
    
#######################################


def remove_conjunctions_from_sets(input_list):
    conjunctions = ['this', 'that','of', 'and', 'or', 'but', 'for', 'nor', 'so', 'yet', 'to', 'with', 'in', 'on', 'at', 'by', 'is', "the", "of", "all", "and", "to", "can", "be", "as", "once", "for", "at", "am", "are", "has", "have", "had", "up", "his", "her", "in", "on", "no"]
    filtered_list = []
    for s, x in input_list:  # Unpack the tuple
        # Check from the beginning
        for i in range(len(s)):
            first_word = s[:i].lower()
            if first_word in conjunctions:
                s = s[i:]
                break
        # Check from the end
        for i in range(len(s), 0, -1):
            last_word = s[i:].lower()
            if last_word in conjunctions:
                s = s[:i]
                break
        filtered_list.append((s, x))  # Append the modified tuple
    return filtered_list


def word_segment(sentences):
    len_3_word = []
    segmented_sentences = []
    term = []
    for sentence, info in sentences:
        words = []
        start = 0
        while start < len(sentence):
            found = False
            for end in range(len(sentence), start, -1):
                word = sentence[start:end]
                if word.lower() in english_words:
                    if word.lower().startswith("the"):
                        word = word[2:]
                    if len(word) >= 4:
                        if word.lower() not in {"a", "an", "the"} or start != 0:
                            words.append(word)
                        start = end
                        found = True
                    else:
                        len_3_word.append((word, info))
                    break
            if not found:
                start += 1
        segmented_sentences.append((words, info))

    term = [(wd.lower(), info) for wd, info in len_3_word if len(wd) >= 3 and wordnet.synsets(word, pos=wordnet.NOUN) and word.lower() not in stop_words and word not in punctuation]
    for x, y in segmented_sentences:
        for word in x:
            if word not in stop_words and word in english_words:
                term.append((word, y))
    return term

####################

# Additional Token Processing

## Hyphenated Terms and Links

Hyphenated terms and links are separated and filtered from the main token list.

- **Hyphenated Terms**:
  - Terms containing a single hyphen are extracted and checked for length and hyphen position.
  - The extracted terms are added to the `hyphen_term` list.
  
- **Links**:
  - Tokens starting with common link prefixes like 'http://', 'www', or 'org' are filtered and added to the `token_link` list.

## Further Token Filtering

Additional token filtering is performed to refine the token lists.

- **Tokens with Hyphens**:
  - Tokens separated in the previous step are removed from the main token list.

- **Tokens Considered as Sentences**:
  - Tokens resembling sentences, often starting with link prefixes, are filtered out.

## Vocabulary Check

Tokens are checked against an English vocabulary to filter out non-English words.

- **Existing in Vocabulary**:
  - Tokens found in the English vocabulary are separated into a new list (`token_exist_in_vocab`), while others remain in the `token_as_sentence` list.

## Cleaning and Processing

The remaining tokens undergo further cleaning and processing.

- **Cleaning Pipeline**:
  - The `cleaning_pipeline` function is applied to tokens categorized as sentences and tokens with punctuation, resulting in cleaned tokens.

- **Token Separation**:
  - The `token_seperator` function is used to separate tokens that might be concatenated words.

## Stemming

Finally, stemming is applied to the tokens, reducing them to their root forms.

- **Stemming**:
  - Each token is stemmed using a stemming algorithm (e.g., Porter Stemmer), and the stemmed tokens are collected in the `stem_terms` list.



In [5]:
hyphen_term = []
token_exist_in_vocab = []

token_link = [(x, y) for x, y in token_have_hyphen if x.startswith('//') or x.startswith('www') or x.startswith('org')]
token_have_hyphen = [(x, y) for x, y in token_have_hyphen if not (x.startswith('//') or x.startswith('www') or x.startswith('org'))]

hyphen_term = [(x, y) for x, y in token_have_hyphen if x.count('-') == 1 and '-' not in [x[0], x[-1]] and 4 <= len(x.split('-')[0]) + len(x.split('-')[1]) <= 20]
token_have_hyphen = [(x, y) for x, y in token_have_hyphen if (x, y) not in hyphen_term]

token_link = [(x, y) for x, y in token_as_sentence if x.startswith('//') or x.startswith('www') or x.startswith('org')]
token_as_sentence = [(x, y) for x, y in token_as_sentence if not (x.startswith('//') or x.startswith('www') or x.startswith('org'))]



token_exist_in_vocab = [(x,y) for x,y in token_as_sentence if x in english_words]
token_as_sentence = [(x, y) for x, y in token_as_sentence if x not in english_words]

token1 += token_exist_in_vocab

tokens_no_further_processing_required, clean_token, token_contain_only_numbers = cleaning_pipeline(token_as_sentence)
all_number += token_contain_only_numbers
token1 += tokens_no_further_processing_required


tokens_no_further_processing_required1, clean_token1, token_contain_only_numbers1 = cleaning_pipeline(token_have_punctuation)
all_number += token_contain_only_numbers1
token1 += tokens_no_further_processing_required1
clean_token += clean_token1

tokens_no_further_processing_required3, clean_token3, token_contain_only_numbers3 = cleaning_pipeline(token_have_hyphen)
all_number += token_contain_only_numbers3
token1 += tokens_no_further_processing_required3
clean_token += clean_token3

tokens_no_further_processing_required2, clean_token2,token_contain_only_numbers2 = token_seperator(clean_token)
token1 += tokens_no_further_processing_required2

sets_list = remove_conjunctions_from_sets(clean_token2)
token_sentences = word_segment(sets_list)

term = []

term = token_link + token1 + all_number + token_sentences + hyphen_term

stem_terms = []
for x, y in term:
    stemmed_x = porter.stem(x)
    stem_terms.append((stemmed_x,y))

In [6]:
doc = []
for x, y in stem_terms:
    if y not in doc:
        doc.append(y.split()[0])

def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)]
doc.sort(key=natural_sort_key)

In [7]:
inverted_index = {file: [] for file in doc}

for term, file in stem_terms:
    inverted_index[file].append(term)

In [8]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)
df = pd.DataFrame(inverted_index[doc[2]])
df.head()

Unnamed: 0,0
0,//portal.acm.org/citation.cfm
1,//doi.org/10.1109/cvprw.2009.5206848
2,//doi.org/10.1109/cvprw.2009.5206848
3,//doi.org/10.23919/mipro.2018.8400040
4,//www.youtube.com/watch


# BOW: BAG OF WORDS
## Bag-of-Words Representation

This section describes the creation of a Bag-of-Words (BoW) representation from the processed token data.

## Construction of BoW DataFrame

A BoW DataFrame (`bow_df`) is constructed to represent the occurrence of terms across documents.

- **DataFrame Structure**:
  - Each row represents a document.
  - Each column represents a unique stemmed term extracted from the documents.

## Generating BoW Data

The BoW DataFrame is populated with term frequencies for each document.

- **Term Frequencies**:
  - For each document (`file`), a Counter object is created to count the occurrences of terms.
  - The frequency of each term in the document is recorded in the corresponding column of the DataFrame.

## Filling Missing Values

Any missing values (NaN) in the DataFrame are filled with zeros to represent terms that do not occur in certain documents.

- **Handling Missing Values**:
  - Missing values in the DataFrame are replaced with zeros using the `fillna` method, ensuring consistent representation across all documents.



In [9]:
all_terms = sorted(set(x for x,y in stem_terms))
bow_df = pd.DataFrame(columns=all_terms, index=doc)

for file, terms in inverted_index.items():
    term_counts = Counter(terms)
    bow_df.loc[file] = [term_counts[term] for term in all_terms]

bow_df.fillna(0, inplace=True)

In [10]:
bow = bow_df.T
bow.iloc[10000:10010]
bow.iloc[2190:2200]

Unnamed: 0,1.txt,2.txt,3.txt,7.txt,8.txt,9.txt,11.txt,12.txt,13.txt,14.txt,15.txt,16.txt,17.txt,18.txt,21.txt,22.txt,23.txt,24.txt,25.txt,26.txt
71331005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
717–727,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
71:2668-79,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71–101,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71–104,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
71–80,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
71–82,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
71–83,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
72,0,1,0,8,3,0,0,0,0,0,0,2,0,0,0,0,0,1,0,1
720,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0


# TF: TERM FREQUENCY

## Construction of TF Matrix

A TF matrix (`tf_matrix`) is constructed to represent the term frequencies normalized by document length.

- **Matrix Structure**:
  - Each row represents a document.
  - Each column represents a unique stemmed term extracted from the documents.

## Generating TF Data

The TF matrix is populated with term frequencies normalized by document length.

- **Normalization Formula**:
  - Term frequencies are transformed using the formula: \( 1 + \log_{10}(tf_{ij} + 1) \), where \( tf_{ij} \) is the term frequency of term \( j \) in document \( i \).

- **Normalization Process**:
  - For each term in each document, the term frequency is normalized using the logarithmic transformation.
  - The normalized term frequencies are recorded in the corresponding cells of the TF matrix.

## Filling Missing Values

Any missing values (NaN) in the TF matrix are filled with zeros.

- **Handling Missing Values**:
  - Missing values in the TF matrix are replaced with zeros using the `fillna` method, ensuring consistent representation of term frequencies.



In [11]:
tf_matrix = pd.DataFrame(columns=all_terms, index=doc)
tf_matrix = bow.copy()
for i in range(len(all_terms)):
    for j in range(20):
        tf_matrix.iloc[i,j] = 1 + np.log10(bow.iloc[i,j] + 1)

tf_matrix.fillna(0, inplace=True)

# Explanation of Term Frequency Calculation

The formula `1 + np.log10(bow.iloc[i,j] + 1)` is used to calculate the Term Frequency (TF) for each term in the Bag-of-Words (BoW) representation.

## TF Calculation Formula

The TF value for a term in a document is determined using the following formula:

\[ \text{TF}_{ij} = 1 + \log_{10}(\text{tf}_{ij} + 1) \]

- \( \text{TF}_{ij} \): Term Frequency of term \( j \) in document \( i \)
- \( \text{tf}_{ij} \): Raw frequency of term \( j \) in document \( i \)

## Explanation of Formula

1. **Addition of 1**: 
   - Adding 1 to the raw term frequency ensures that terms with zero occurrences are assigned a non-zero TF value.

2. **Logarithmic Transformation**:
   - The raw term frequency is transformed using a logarithmic function (base 10).
   - Logarithmic scaling compresses the range of term frequencies, reducing the influence of very high frequencies.

3. **Normalization**:
   - By adding 1 before taking the logarithm, the TF values are normalized, preventing extremely high frequencies from dominating the TF calculation.
   - Normalization helps balance the importance of terms within documents, especially for documents with varying lengths.

## Impact on Term Weight

- **Increase in Weight**:
  - The formula increases the weight assigned to terms based on their frequency in documents.
  - Terms occurring more frequently in a document are assigned higher TF values, reflecting their relative importance within the document.



In [12]:
bow.head()
tf_matrix.iloc[2190:2200]

Unnamed: 0,1.txt,2.txt,3.txt,7.txt,8.txt,9.txt,11.txt,12.txt,13.txt,14.txt,15.txt,16.txt,17.txt,18.txt,21.txt,22.txt,23.txt,24.txt,25.txt,26.txt
71331005,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0
717–727,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71:2668-79,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–101,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–104,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–80,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–82,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–83,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
72,1.0,1.30103,1.0,1.954243,1.60206,1.0,1.0,1.0,1.0,1.0,1.0,1.477121,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.30103
720,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.477121,1.0,1.0,1.0,1.0,1.0


In [13]:
tf = tf_matrix.T
tf_matrix.iloc[10000:10010]

Unnamed: 0,1.txt,2.txt,3.txt,7.txt,8.txt,9.txt,11.txt,12.txt,13.txt,14.txt,15.txt,16.txt,17.txt,18.txt,21.txt,22.txt,23.txt,24.txt,25.txt,26.txt
miami,1.0,1.0,1.30103,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
miao,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.60206,1.0,1.0,1.0,1.0,1.0,1.90309,1.0,1.0
miaoa,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0
miaozhang,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0
mich,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michael,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.778151,1.30103,1.60206,1.0,1.0,1.0,1.0
michaelcollin,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0
michailidi,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.30103,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michal,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.477121,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michalski,1.0,1.0,1.0,1.477121,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# IDF: INVERSE DOCUMENT FREQUENCY

# Inverse Document Frequency (IDF) Matrix

This section describes the generation of the Inverse Document Frequency (IDF) matrix from the Bag-of-Words representation.

## Construction of IDF Matrix

An IDF matrix (`idf_matrix`) is constructed to represent the inverse document frequencies of terms.

- **Matrix Structure**:
  - The IDF matrix has a single row representing IDF values for all terms.
  - Each column represents a unique stemmed term extracted from the documents.

## Generating IDF Data

The IDF matrix is populated with IDF values calculated based on document frequencies.

- **IDF Calculation**:
  - IDF values are calculated using the formula: \( \log_{10} \left( \frac{N}{df_t} \right) \), where \( N \) is the total number of documents and \( df_t \) is the number of documents containing term \( t \).

- **IDF Calculation Process**:
  - For each term in the document, the number of documents containing the term (\( df_t \)) is counted.
  - IDF values are calculated based on the total number of documents and the document frequency of each term.
  - The calculated IDF values are recorded in the corresponding cells of the IDF matrix.



In [14]:
idf_matrix = pd.DataFrame(columns=all_terms, index=['idf'])
doc_fre = 0

for term in all_terms:
    for i in range(20):
        if bow_df[term][i] > 0:
            doc_fre += 1
    id_fre = np.log10(len(doc) / (doc_fre)) 
    idf_matrix[term] = id_fre
    doc_fre  = 0

In [15]:
idf = idf_matrix.T
idf.iloc[10000:10010]

Unnamed: 0,idf
miami,1.0
miao,1.0
miaoa,1.30103
miaozhang,1.30103
mich,1.30103
michael,0.823909
michaelcollin,1.30103
michailidi,1.0
michal,1.30103
michalski,1.30103


# COSINE SIMILARITY

## WEIGHT MATRIX
# Calculation of Weighted Term Frequencies

This section describes the calculation of weighted term frequencies using TF-IDF values.

## Construction of Weighted Matrix

A weighted matrix (`weight`) is constructed to represent the weighted term frequencies based on TF-IDF values.

- **Matrix Structure**:
  - Each row represents a document.
  - Each column represents a unique stemmed term extracted from the documents.

## Generating Weighted Data

The weighted matrix is populated with weighted term frequencies calculated using TF-IDF values.

- **Weighted Calculation**:
  - Weighted term frequencies are calculated by multiplying TF values with IDF values for each term in each document.

- **Weighted Calculation Process**:
  - For each term in each document, the TF value is multiplied by the corresponding IDF value to calculate the weighted term frequency.
  - The calculated weighted term frequencies are recorded in the corresponding cells of the weighted matrix.

## Cosine Similarity

Once the weighted matrix is constructed, cosine similarity can be calculated to measure the similarity between documents.

- **Cosine Similarity**:
  - Cosine similarity measures the cosine of the angle between two vectors, representing document representations in a high-dimensional space.
  - Higher cosine similarity values indicate greater similarity between documents, while values closer to 0 indicate dissimilarity.



In [16]:
weight = pd.DataFrame(columns=all_terms, index=doc)

for i in range(len(doc)):
    for j in range(len(all_terms)):
        weighted_value = tf.iloc[i, j] * idf.iloc[i, 0]
        weight.iloc[i, j] = weighted_value

In [17]:
weight_matrix = weight.T
weight_matrix[10000:10010]

Unnamed: 0,1.txt,2.txt,3.txt,7.txt,8.txt,9.txt,11.txt,12.txt,13.txt,14.txt,15.txt,16.txt,17.txt,18.txt,21.txt,22.txt,23.txt,24.txt,25.txt,26.txt
miami,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
miao,1.30103,1.30103,1.0,1.0,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,2.084328,1.30103,1.30103,1.30103,1.30103,1.30103,2.475977,1.30103,1.30103
miaoa,1.30103,1.30103,1.0,1.0,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.692679,1.30103,1.30103
miaozhang,1.30103,1.30103,1.0,1.0,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.692679,1.30103,1.30103,1.30103,1.30103
mich,1.30103,1.30103,1.0,1.0,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.692679,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michael,1.30103,1.30103,1.0,1.0,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,2.313428,1.692679,2.084328,1.30103,1.30103,1.30103,1.30103
michaelcollin,1.30103,1.30103,1.0,1.0,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.692679,1.30103,1.30103,1.30103,1.30103
michailidi,1.30103,1.30103,1.0,1.0,1.30103,1.30103,1.30103,1.30103,1.692679,1.692679,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michal,1.30103,1.30103,1.0,1.0,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.921779,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michalski,1.30103,1.30103,1.0,1.477121,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103


## NORMALIZING
# Normalization of Weighted Term Frequencies

This section describes the normalization of weighted term frequencies in the weighted matrix.

## Normalization Process

The weighted term frequencies in the weighted matrix are normalized to ensure consistent representation across documents.

- **Normalization Formula**:
  - Each term frequency in the weighted matrix is divided by the square root of the sum of squares of all term frequencies in the corresponding column.

- **Normalization Steps**:
  1. Calculate the sum of squares of term frequencies for each term across all documents.
  2. Compute the square root of the sum of squares to obtain the normalization factor.
  3. Normalize each term frequency by dividing it by the normalization factor.

- **Normalization Result**:
  - After normalization, each term frequency represents its relative importance within the document and facilitates comparison between documents.

## Application of Normalization

Normalized weighted term frequencies enable more accurate comparison and analysis of document similarity.



In [18]:
normalized_values = []
for i in range(20):
    sum_of_squares = np.sum(weight_matrix.iloc[:, i] ** 2)
    normalized_values.append(np.sqrt(sum_of_squares))

for i in range(20):
    for j in range(len(all_terms)):
        weight_matrix.iloc[j, i] = weight_matrix.iloc[j, i] / normalized_values[i]

In [19]:
len(normalized_values)
normalized_values

[183.6902036050944,
 180.24936575267787,
 133.65144284829657,
 154.6767650174217,
 177.116707498794,
 173.54479017716977,
 170.6589589728953,
 189.368560030639,
 184.6295851150998,
 184.6295851150998,
 179.8988760705069,
 183.61205045211517,
 179.78112153013782,
 175.50482003283742,
 178.47844728591338,
 190.28086244780235,
 172.2767334312915,
 174.37966612878904,
 174.639548811969,
 179.46324612046334]

# SAVE INTO CSV FILE

In [20]:
csv_file = 'vector_space_model.csv'
weight_matrix.to_csv(csv_file)

# QUERY PROCESSING

In [21]:
def search(query):
    score = []
    q_term = []
    q_vec = {}
    doc_vec = {}
    result = {}
    magnitude = 0
    sum = 0
    
    q_token = nltk.word_tokenize(query)
    for x in q_token:
        stemmed_x = porter.stem(x)
        q_term.append(stemmed_x)

    for x in q_term:
        q_vec[x] = idf.loc[x][0]

    for x,y in q_vec.items():
        magnitude = magnitude + y**2

    for x,y in q_vec.items():
        q_vec[x] = y/np.sqrt(magnitude)

    for x,y in q_vec.items():
        doc_vec[x] = []
        for i in range(20):
            doc_vec[x].append(weight_matrix.loc[x][i])
    
    for i in range(20):
        for x,y in doc_vec.items():
            sum = sum + y[i]*(q_vec[x])
        score.append((sum,doc[i]))
    score.sort()

    for i in range(20):
        if score[i][0] < 0.05:
            # print(score[i])
            result[score[i][1]] = score[i][0]
    return result,score

In [None]:
start = True
print('FOR EXIT ENTER EMPTY STRING')
while start:
    query = input('Search: ')
    if query != "":
        result, score = search(query)
        res_df = pd.DataFrame.from_dict(result, orient='index', columns=['Rank'])
        print(res_df)
        print("\n")
    else:
        start = False

FOR EXIT ENTER EMPTY STRING


Search:  machine learning


           Rank
1.txt  0.027669




Search:  books


           Rank
1.txt  0.007083
2.txt  0.014301
3.txt  0.021783
7.txt  0.030194
8.txt  0.037540
9.txt  0.045037


