In [180]:
from sklearn.metrics import classification_report, accuracy_score, silhouette_score, adjusted_rand_score
from sklearn.preprocessing import LabelEncoder, StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics.cluster import contingency_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk.tokenize import word_tokenize
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from collections import Counter
from math import log
import pandas as pd
import numpy as np
import string
import nltk
import csv
import os
import re

In [181]:
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to C:\Users\Zain
[nltk_data]     Abbas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Zain
[nltk_data]     Abbas\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# Text Data Processing Toolkit

This toolkit provides functions to process text data extracted from research papers. It includes functions for tokenization, cleaning, and segmentation of text.

## Functions

### 1. `remove_stopwords(tokens)`

- **Purpose**: Removes stopwords and punctuation from a list of tokens.
- **Input**: 
  - `tokens`: List of tokens to be processed.
- **Output**:
  - `filtered_tokens`: List of tokens with stopwords and punctuation removed.

### 2. `extract_and_index_data(directory)`

- **Purpose**: Extracts and indexes data from text files in a directory.
- **Input**:
  - `directory`: Path to the directory containing text files.
- **Output**:
  - Categorized tokens:
    - `token1`: Tokens of length 3-14 characters.
    - `token_of_len_2`: Tokens of length 2 characters.
    - `token_as_sentence`: Tokens longer than 14 characters.
    - `token_have_hyphen`: Tokens containing hyphens.
    - `token_have_punctuation`: Tokens containing punctuation.
    - `all_token`: All other tokens.
    - `all_number`: Tokens consisting only of numbers.

### 3. `cleaning_pipeline(token)`

- **Purpose**: Further cleans tokenized data by splitting tokens with multiple words, removing non-alphabetic characters, and filtering tokens based on their length and presence in the English dictionary.
- **Input**:
  - `token`: List of tokens to be cleaned.
- **Output**:
  - `tokens_no_further_processing_required`: Cleaned tokens not requiring further processing.
  - `clean_token`: Cleaned tokens after initial processing.
  - `token_contain_only_numbers`: Tokens containing only numbers.

### 4. `token_seperator(token)`

- **Purpose**: Separates concatenated tokens using dynamic programming and further cleans the tokens.
- **Input**:
  - `token`: List of tokens to be separated.
- **Output**:
  - `tokens_no_further_processing_required`: Cleaned tokens not requiring further processing.
  - `clean_token`: Cleaned tokens after initial processing.
  - `token_contain_only_numbers`: Tokens containing only numbers.

### 5. `remove_conjunctions_from_sets(input_list)`

- **Purpose**: Removes common conjunctions from a list of tokens.
- **Input**:
  - `input_list`: List of tokens to be processed.
- **Output**:
  - `filtered_list`: List of tokens with common conjunctions removed.

### 6. `word_segment(sentences)`

- **Purpose**: Segments words from sentences, handling cases of concatenated words and filtering out stopwords and short words.
- **Input**:
  - `sentences`: List of sentences to be processed.
- **Output**:
  - Segmented words.


In [182]:
stop_words = set(["a", "is", "the", "of", "all", "and", "to", "can", "be", "as", "once", "for", "at", "am", "are", "has", "have", "had", "up", "his", "her", "in", "on", "no", "we", "do"])
punctuation = set(['!', '\\', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '—'])
english_words = set(nltk.corpus.words.words())
porter = PorterStemmer()


#remove_stopwords, simply remove the stopwords and the punctuatated tokens
def remove_stopwords(tokens):    
    punctuation = set(string.punctuation)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in punctuation]
    return filtered_tokens

def extract_and_index_data(directory):
    
    token1 = []
    token_of_len_2 = []
    token_as_sentence = []
    token_have_hyphen = []
    token_have_punctuation = []
    all_token = []
    all_number = []
    
    for filename in os.listdir(directory):
  
        if os.path.isfile(os.path.join(directory, filename)):
            with open(os.path.join(directory, filename), 'r') as text:
                data = text.read()

            file_tokens = word_tokenize(data)
            normalized = remove_stopwords(file_tokens)

            for token in normalized:
                if 3 <= len(token) <= 14 and all(char not in string.punctuation for char in token) and not token.isdigit():
                    token1.append((token.lower(), filename))
                elif len(token) == 2 and not token.isdigit():
                    token_of_len_2.append((token.lower(), filename))
                elif len(token) > 14 and '-' not in token and not token.isdigit():
                    token_as_sentence.append((token.lower(), filename))
                elif '-' in token:
                    token_have_hyphen.append((token.lower(), filename))
                elif any(char in string.punctuation for char in token) and '-' not in token and len(token) <= 14 and not token.isdigit():
                    token_have_punctuation.append((token.lower(), filename))
                elif token.isdigit():
                    all_number.append((token.lower(), filename))
                else:
                    all_token.append((token.lower(), filename))

    return (token1, token_of_len_2, token_as_sentence, token_have_hyphen, token_have_punctuation, all_token, all_number)


directory_path = r'D:\SEMESTER 06\INFORMATION RETRIEVAL\ASSIGNMENT II\ResearchPapers'
#function invoking
tokens= extract_and_index_data(directory_path)

token1, token_of_len_2, token_as_sentence, token_have_hyphen, token_have_punctuation, all_token, all_number = tokens

#########################   

def cleaning_pipeline(token):
    clean_token = []
    token_split = []
    new_token = []
    tokens_no_further_processing_required = []
    token_contain_only_numbers = []
    for x, y in token:
        if len(x) >= 4 and not x.isdigit() and any(c.isalpha() for c in x):
            new_x = ''.join(' ' if char in punctuation else char for char in x)
            if ' ' in new_x:
                token_split = new_x.split(' ')
                for word in token_split:
                    if len(word) >= 4:
                        new_token.append((word, y))
            else:
                if len(new_x) >= 4:
                    new_token.append((new_x, y))
        elif x.isdigit():
            token_contain_only_numbers.append((x, y))
    
    for x,y in new_token:
        if x.isdigit():
            token_contain_only_numbers.append((x,y))

    
    for x, y in new_token:
        if any(char.isdigit() for char in x):
            cleaned_x = ''.join(char if char != '—' else '' for char in x) and ''.join(char for char in x if not char.isdigit())
            split_tokens = nltk.word_tokenize(cleaned_x)
            for word in split_tokens:
                if len(word) >= 4:
                    clean_token.append((word, y))
        else:
            clean_token.append((x, y))
    tokens_no_further_processing_required = [(x,y) for x,y in clean_token if x in english_words]
    clean_token = [(x, y) for x, y in clean_token if x not in english_words]
    return tokens_no_further_processing_required,clean_token,token_contain_only_numbers

#########################################

def token_seperator(token):
    words = []
    for x in english_words:
        words.append(x)

    for x,y in token1:
        words.append(x)


    wordcost = {k: log((i + 1) * log(len(words))) for i, k in enumerate(words)}
    maxword = max(len(x) for x in words)
    # return infer_spaces(token,maxword,wordcost)
    clean_term = []
    for i in range(len(token)):
        clean_term.append((infer_spaces(token[i][0].lower(),maxword,wordcost),token[i][1]))
    return cleaning_pipeline(clean_term)
    

def infer_spaces(s,maxword,wordcost):
    def best_match(i):
        candidates = enumerate(reversed(cost[max(0, i - maxword):i]))
        return min((c + wordcost.get(s[i - k - 1:i], 9e999), k + 1) for k, c in candidates)
    
    cost = [0]
    for i in range(1, len(s) + 1):
        c, k = best_match(i)
        cost.append(c)
    
    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k
    
    return " ".join(reversed(out))
    
#######################################


def remove_conjunctions_from_sets(input_list):
    conjunctions = ['this', 'that','of', 'and', 'or', 'but', 'for', 'nor', 'so', 'yet', 'to', 'with', 'in', 'on', 'at', 'by', 'is', "the", "of", "all", "and", "to", "can", "be", "as", "once", "for", "at", "am", "are", "has", "have", "had", "up", "his", "her", "in", "on", "no"]
    filtered_list = []
    for s, x in input_list:  # Unpack the tuple
        # Check from the beginning
        for i in range(len(s)):
            first_word = s[:i].lower()
            if first_word in conjunctions:
                s = s[i:]
                break
        # Check from the end
        for i in range(len(s), 0, -1):
            last_word = s[i:].lower()
            if last_word in conjunctions:
                s = s[:i]
                break
        filtered_list.append((s, x))  # Append the modified tuple
    return filtered_list


def word_segment(sentences):
    len_3_word = []
    segmented_sentences = []
    term = []
    for sentence, info in sentences:
        words = []
        start = 0
        while start < len(sentence):
            found = False
            for end in range(len(sentence), start, -1):
                word = sentence[start:end]
                if word.lower() in english_words:
                    if word.lower().startswith("the"):
                        word = word[2:]
                    if len(word) >= 4:
                        if word.lower() not in {"a", "an", "the"} or start != 0:
                            words.append(word)
                        start = end
                        found = True
                    else:
                        len_3_word.append((word, info))
                    break
            if not found:
                start += 1
        segmented_sentences.append((words, info))

    term = [(wd.lower(), info) for wd, info in len_3_word if len(wd) >= 3 and wordnet.synsets(word, pos=wordnet.NOUN) and word.lower() not in stop_words and word not in punctuation]
    for x, y in segmented_sentences:
        for word in x:
            if word not in stop_words and word in english_words:
                term.append((word, y))
    return term

####################

# Additional Token Processing

## Hyphenated Terms and Links

Hyphenated terms and links are separated and filtered from the main token list.

- **Hyphenated Terms**:
  - Terms containing a single hyphen are extracted and checked for length and hyphen position.
  - The extracted terms are added to the `hyphen_term` list.
  
- **Links**:
  - Tokens starting with common link prefixes like 'http://', 'www', or 'org' are filtered and added to the `token_link` list.

## Further Token Filtering

Additional token filtering is performed to refine the token lists.

- **Tokens with Hyphens**:
  - Tokens separated in the previous step are removed from the main token list.

- **Tokens Considered as Sentences**:
  - Tokens resembling sentences, often starting with link prefixes, are filtered out.

## Vocabulary Check

Tokens are checked against an English vocabulary to filter out non-English words.

- **Existing in Vocabulary**:
  - Tokens found in the English vocabulary are separated into a new list (`token_exist_in_vocab`), while others remain in the `token_as_sentence` list.

## Cleaning and Processing

The remaining tokens undergo further cleaning and processing.

- **Cleaning Pipeline**:
  - The `cleaning_pipeline` function is applied to tokens categorized as sentences and tokens with punctuation, resulting in cleaned tokens.

- **Token Separation**:
  - The `token_seperator` function is used to separate tokens that might be concatenated words.

## Stemming

Finally, stemming is applied to the tokens, reducing them to their root forms.

- **Stemming**:
  - Each token is stemmed using a stemming algorithm (e.g., Porter Stemmer), and the stemmed tokens are collected in the `stem_terms` list.



In [183]:
hyphen_term = []
token_exist_in_vocab = []

token_link = [(x, y) for x, y in token_have_hyphen if x.startswith('//') or x.startswith('www') or x.startswith('org')]
token_have_hyphen = [(x, y) for x, y in token_have_hyphen if not (x.startswith('//') or x.startswith('www') or x.startswith('org'))]

hyphen_term = [(x, y) for x, y in token_have_hyphen if x.count('-') == 1 and '-' not in [x[0], x[-1]] and 4 <= len(x.split('-')[0]) + len(x.split('-')[1]) <= 20]
token_have_hyphen = [(x, y) for x, y in token_have_hyphen if (x, y) not in hyphen_term]

token_link = [(x, y) for x, y in token_as_sentence if x.startswith('//') or x.startswith('www') or x.startswith('org')]
token_as_sentence = [(x, y) for x, y in token_as_sentence if not (x.startswith('//') or x.startswith('www') or x.startswith('org'))]


token_exist_in_vocab = [(x,y) for x,y in token_as_sentence if x in english_words]
token_as_sentence = [(x, y) for x, y in token_as_sentence if x not in english_words]

token1 += token_exist_in_vocab

tokens_no_further_processing_required, clean_token, token_contain_only_numbers = cleaning_pipeline(token_as_sentence)
all_number += token_contain_only_numbers
token1 += tokens_no_further_processing_required


tokens_no_further_processing_required1, clean_token1, token_contain_only_numbers1 = cleaning_pipeline(token_have_punctuation)
all_number += token_contain_only_numbers1
token1 += tokens_no_further_processing_required1
clean_token += clean_token1

tokens_no_further_processing_required3, clean_token3, token_contain_only_numbers3 = cleaning_pipeline(token_have_hyphen)
all_number += token_contain_only_numbers3
token1 += tokens_no_further_processing_required3
clean_token += clean_token3

tokens_no_further_processing_required2, clean_token2,token_contain_only_numbers2 = token_seperator(clean_token)
token1 += tokens_no_further_processing_required2

sets_list = remove_conjunctions_from_sets(clean_token2)
token_sentences = word_segment(sets_list)

term = []

term = token_link + token1 + all_number + token_sentences + hyphen_term

stem_terms = []
for x, y in term:
    stemmed_x = porter.stem(x)
    stem_terms.append((stemmed_x,y))

In [184]:
doc_names = ['1.txt', '2.txt', '3.txt', '7.txt', '8.txt', '9.txt', '11.txt', '12.txt', '13.txt', '14.txt', '15.txt', '16.txt', '17.txt', '18.txt', '21.txt', '22.txt', '23.txt', '24.txt', '25.txt', '26.txt']
print(len(doc_names))
doc = [int(re.search(r'\d+', doc_name).group()) for doc_name in doc_names]

doc.sort()
print(doc)

20
[1, 2, 3, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26]


In [185]:
inverted_index = {file: [] for file in doc}
for term, file in stem_terms:
    file_idx = int(file[0].split('.')[0])
    inverted_index[file_idx].append(term)

In [186]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)
df = pd.DataFrame(inverted_index.items(), columns=['File', 'Terms'])

df.head()

Unnamed: 0,File,Terms
0,1,"[//doi.org/10.1002/widm.1391, //wires.onlineli..."
1,2,"[//featureselection.asu.edu/, //doi.org/10.114..."
2,3,"[//portal.acm.org/citation.cfm, //doi.org/10.1..."
3,7,"[//www.fairmlbook.org, //kilthub.cmu.edu/artic..."
4,8,"[//doi.org/10.36628/ijhf.2023.0050, //doi.org/..."


# BOW: BAG OF WORDS
## Bag-of-Words Representation

This section describes the creation of a Bag-of-Words (BoW) representation from the processed token data.

## Construction of BoW DataFrame

A BoW DataFrame (`bow_df`) is constructed to represent the occurrence of terms across documents.

- **DataFrame Structure**:
  - Each row represents a document.
  - Each column represents a unique stemmed term extracted from the documents.

## Generating BoW Data

The BoW DataFrame is populated with term frequencies for each document.

- **Term Frequencies**:
  - For each document (`file`), a Counter object is created to count the occurrences of terms.
  - The frequency of each term in the document is recorded in the corresponding column of the DataFrame.

## Filling Missing Values

Any missing values (NaN) in the DataFrame are filled with zeros to represent terms that do not occur in certain documents.

- **Handling Missing Values**:
  - Missing values in the DataFrame are replaced with zeros using the `fillna` method, ensuring consistent representation across all documents.



In [187]:
all_terms = sorted(set(x for x,y in stem_terms))
bow_df = pd.DataFrame(columns=all_terms, index=doc)

for file, terms in inverted_index.items():
    term_counts = Counter(terms)
    bow_df.loc[file] = [term_counts[term] for term in all_terms]

bow_df.fillna(0, inplace=True)

In [188]:
bow = bow_df.T
bow.iloc[10000:10010]
bow.iloc[2190:2200]

Unnamed: 0,1,2,3,7,8,9,11,12,13,14,15,16,17,18,21,22,23,24,25,26
71331005,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
717–727,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71:2668-79,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71–101,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71–104,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71–80,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71–82,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71–83,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
72,2,3,0,8,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
720,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# TF: TERM FREQUENCY

## Construction of TF Matrix

A TF matrix (`tf_matrix`) is constructed to represent the term frequencies normalized by document length.

- **Matrix Structure**:
  - Each row represents a document.
  - Each column represents a unique stemmed term extracted from the documents.

## Generating TF Data

The TF matrix is populated with term frequencies normalized by document length.

- **Normalization Formula**:
  - Term frequencies are transformed using the formula: \( 1 + \log_{10}(tf_{ij} + 1) \), where \( tf_{ij} \) is the term frequency of term \( j \) in document \( i \).

- **Normalization Process**:
  - For each term in each document, the term frequency is normalized using the logarithmic transformation.
  - The normalized term frequencies are recorded in the corresponding cells of the TF matrix.

## Filling Missing Values

Any missing values (NaN) in the TF matrix are filled with zeros.

- **Handling Missing Values**:
  - Missing values in the TF matrix are replaced with zeros using the `fillna` method, ensuring consistent representation of term frequencies.



In [189]:
tf_matrix = pd.DataFrame(columns=all_terms, index=doc)
tf_matrix = bow.copy()
for i in range(len(all_terms)):
    for j in range(20):
        tf_matrix.iloc[i,j] = 1 + np.log10(bow.iloc[i,j] + 1)

tf_matrix.fillna(0, inplace=True)

# Explanation of Term Frequency Calculation

The formula `1 + np.log10(bow.iloc[i,j] + 1)` is used to calculate the Term Frequency (TF) for each term in the Bag-of-Words (BoW) representation.

## TF Calculation Formula

The TF value for a term in a document is determined using the following formula:

\[ \text{TF}_{ij} = 1 + \log_{10}(\text{tf}_{ij} + 1) \]

- \( \text{TF}_{ij} \): Term Frequency of term \( j \) in document \( i \)
- \( \text{tf}_{ij} \): Raw frequency of term \( j \) in document \( i \)

## Explanation of Formula

1. **Addition of 1**: 
   - Adding 1 to the raw term frequency ensures that terms with zero occurrences are assigned a non-zero TF value.

2. **Logarithmic Transformation**:
   - The raw term frequency is transformed using a logarithmic function (base 10).
   - Logarithmic scaling compresses the range of term frequencies, reducing the influence of very high frequencies.

3. **Normalization**:
   - By adding 1 before taking the logarithm, the TF values are normalized, preventing extremely high frequencies from dominating the TF calculation.
   - Normalization helps balance the importance of terms within documents, especially for documents with varying lengths.

## Impact on Term Weight

- **Increase in Weight**:
  - The formula increases the weight assigned to terms based on their frequency in documents.
  - Terms occurring more frequently in a document are assigned higher TF values, reflecting their relative importance within the document.



In [190]:
bow.head()
tf_matrix.iloc[2190:2200]

Unnamed: 0,1,2,3,7,8,9,11,12,13,14,15,16,17,18,21,22,23,24,25,26
71331005,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
717–727,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71:2668-79,1.0,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–101,1.0,1.0,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–104,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–80,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–82,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
71–83,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
72,1.477121,1.60206,1.0,1.954243,1.60206,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
720,1.30103,1.477121,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [191]:
tf = tf_matrix.T
tf_matrix.iloc[10000:10010]

Unnamed: 0,1,2,3,7,8,9,11,12,13,14,15,16,17,18,21,22,23,24,25,26
mich,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michael,1.778151,1.69897,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michaelcollin,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michailidi,1.477121,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michal,1.477121,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michalski,1.0,1.0,1.0,1.477121,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michel,1.0,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michi,1.477121,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
michigan,1.0,1.845098,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
micro,1.0,1.477121,1.30103,1.30103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# IDF: INVERSE DOCUMENT FREQUENCY

# Inverse Document Frequency (IDF) Matrix

This section describes the generation of the Inverse Document Frequency (IDF) matrix from the Bag-of-Words representation.

## Construction of IDF Matrix

An IDF matrix (`idf_matrix`) is constructed to represent the inverse document frequencies of terms.

- **Matrix Structure**:
  - The IDF matrix has a single row representing IDF values for all terms.
  - Each column represents a unique stemmed term extracted from the documents.

## Generating IDF Data

The IDF matrix is populated with IDF values calculated based on document frequencies.

- **IDF Calculation**:
  - IDF values are calculated using the formula: \( \log_{10} \left( \frac{N}{df_t} \right) \), where \( N \) is the total number of documents and \( df_t \) is the number of documents containing term \( t \).

- **IDF Calculation Process**:
  - For each term in the document, the number of documents containing the term (\( df_t \)) is counted.
  - IDF values are calculated based on the total number of documents and the document frequency of each term.
  - The calculated IDF values are recorded in the corresponding cells of the IDF matrix.



In [192]:
doc_freq = (bow_df > 0).sum(axis=0)

idf_values = np.log10(len(doc) / doc_freq)

idf_matrix = pd.DataFrame([idf_values], columns=all_terms, index=['idf'])

In [193]:
idf = idf_matrix.T
idf.iloc[2000:2010]

Unnamed: 0,idf
559–583,1.30103
55–60,1.30103
55–79,1.30103
56,0.69897
561–563,1.30103
563,1.30103
563–572,1.30103
565–566,1.30103
567–568,1.30103
567–570,1.30103


In [194]:
idf.to_csv('idf.csv')

# COSINE SIMILARITY

## WEIGHT MATRIX
# Calculation of Weighted Term Frequencies

This section describes the calculation of weighted term frequencies using TF-IDF values.

## Construction of Weighted Matrix

A weighted matrix (`weight`) is constructed to represent the weighted term frequencies based on TF-IDF values.

- **Matrix Structure**:
  - Each row represents a document.
  - Each column represents a unique stemmed term extracted from the documents.

## Generating Weighted Data

The weighted matrix is populated with weighted term frequencies calculated using TF-IDF values.

- **Weighted Calculation**:
  - Weighted term frequencies are calculated by multiplying TF values with IDF values for each term in each document.

- **Weighted Calculation Process**:
  - For each term in each document, the TF value is multiplied by the corresponding IDF value to calculate the weighted term frequency.
  - The calculated weighted term frequencies are recorded in the corresponding cells of the weighted matrix.

## Cosine Similarity

Once the weighted matrix is constructed, cosine similarity can be calculated to measure the similarity between documents.

- **Cosine Similarity**:
  - Cosine similarity measures the cosine of the angle between two vectors, representing document representations in a high-dimensional space.
  - Higher cosine similarity values indicate greater similarity between documents, while values closer to 0 indicate dissimilarity.



In [195]:
df = pd.DataFrame(columns=all_terms, index=doc)

for i in range(len(doc)):
    for j in range(len(all_terms)):
        weighted_value = tf.iloc[i, j] * idf.iloc[i, 0]
        df.iloc[i, j] = weighted_value

In [196]:
df.to_csv('weight.csv')

In [197]:
weight = pd.read_csv('weight.csv', index_col=0)

In [198]:
weight.shape

(20, 16660)

In [199]:
tf_idf = weight.T
tf_idf[10000:10010]

Unnamed: 0,1,2,3,7,8,9,11,12,13,14,15,16,17,18,21,22,23,24,25,26
mich,1.692679,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michael,2.313428,2.210411,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michaelcollin,1.30103,1.692679,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michailidi,1.921779,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michal,1.921779,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michalski,1.30103,1.30103,1.30103,1.921779,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michel,1.30103,1.692679,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michi,1.921779,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
michigan,1.30103,2.400528,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103
micro,1.30103,1.921779,1.692679,1.692679,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103,1.30103


# SUPERVISED CLASSIFICATION

## CLASSIFICATION (TARGET VARIABLE)

In [200]:
weight = weight.assign(classes=0)

In [201]:
classes = {
    "Explainable Artificial Intelligence": [1, 2, 3, 7],
    "Heart Failure": [8, 9, 11],
    "Time Series Forecasting": [12, 13, 14, 15, 16],
    "Transformer Model": [17, 18, 21],
    "Feature Selection": [22, 23, 24, 25, 26]
}

targets = []
for doc_id in tf_idf.columns:
    for class_name, doc_list in classes.items():
        if doc_id in doc_list:
            targets.append(class_name)
            break

target_labels = pd.Categorical(targets).codes

target_labels

array([0, 0, 0, 0, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 1, 1, 1, 1, 1],
      dtype=int8)

In [202]:
for index, target_label in zip(weight.index, target_labels):
    weight.at[index, 'classes'] = target_label

In [203]:
weight['classes']

1     0
2     0
3     0
7     0
8     2
9     2
11    2
12    3
13    3
14    3
15    3
16    3
17    4
18    4
21    4
22    1
23    1
24    1
25    1
26    1
Name: classes, dtype: int64

## FEATURES SELECTION

## Feature Selection and Preprocessing Pipeline

This README provides an in-depth understanding of the feature selection and preprocessing pipeline implemented in the code.

### Code Overview:

1. **Data Preparation**:
   - The input data consists of features stored in the DataFrame `weight`. Features are stored in columns, with the target variable ('classes') separated.

2. **Missing Value Imputation**:
   - Missing values in the input features (`X`) are imputed using the mean strategy through `SimpleImputer`.

3. **Feature Scaling**:
   - The imputed features are then scaled using Min-Max scaling via `MinMaxScaler`. This ensures that all features are on the same scale, preventing any particular feature from dominating the analysis due to its larger magnitude.

4. **Feature Selection**:
   - Feature selection is performed using the chi-square test (`SelectKBest` with `chi2` score function). It selects the top `k=1000` features based on their chi-square scores, which measure the dependency between each feature and the target variable ('classes').
   - The selected features are stored in `X_selected`.

### Usage:

1. **Input Data**:
   - Ensure that the input data (`weight`) contains features along with the target variable ('classes').

2. **Execution**:
   - Execute the provided code to perform missing value imputation, feature scaling, and feature selection.
   - The resulting selected features (`X_selected`) can be used for further analysis or modeling.

In [204]:
X = weight.drop(['classes'], axis=1)
y = weight['classes']

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_imputed)

selector = SelectKBest(score_func=chi2, k=1000)
X_selected = selector.fit_transform(X_scaled, y)

## KNN CLASSIFICATION 

In [205]:
k = 5 
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_selected, y)

# UNSPUERVISED CLASSIFICATION



## Clustering Evaluation with K-Means

This code evaluates the performance of a K-Means clustering algorithm using three evaluation metrics: Purity, Silhouette Score, and Random Index.

### Code Overview:

1. **K-Means Clustering**:
   - Utilizes the K-Means algorithm to cluster data into `k=5` clusters (`n_clusters=k`).

2. **Evaluation Metrics**:
   - **Purity**: Measures the homogeneity of clusters regarding class membership.
   - **Silhouette Score**: Reflects the separation and compactness of clusters.
   - **Random Index**: Measures the similarity between predicted and actual clusterings.

3. **Functions**:
   - `compute_purity(true_labels, cluster_labels)`: Computes the purity score based on true class labels and cluster labels.
   - `compute_random_index(true_labels, cluster_labels)`: Computes the random index based on true class labels and cluster labels.

4. **Usage**:
   - Ensure the input data (`X_selected`) and true class labels (`true_labels`) are properly defined.
   - Execute the code to obtainEADME with additional details or instructions!

In [206]:
k = 5 
kmeans = KMeans(n_clusters=k, random_state=42)
cluster_labels = kmeans.fit_predict(X_selected)

  super()._check_params_vs_input(X, default_n_init=10)


In [207]:
def compute_purity(true_labels, cluster_labels):
    cm = contingency_matrix(true_labels, cluster_labels)
    purity = np.sum(np.amax(cm, axis=0)) / np.sum(cm)
    return purity

silhouette = silhouette_score(X, cluster_labels)

def compute_random_index(true_labels, cluster_labels):
    cm = contingency_matrix(true_labels, cluster_labels)
    a = np.sum(np.square(cm))
    b = np.sum(np.square(np.sum(cm, axis=0))) - a
    c = np.sum(np.square(np.sum(cm, axis=1))) - a
    d = np.sum(np.square(np.sum(cm)) - (a + b + c))
    rand_index = (a + d) / (a + b + c + d)
    return rand_index

true_labels = weight['classes']

## Clustering Evaluation on Limited Data

This README outlines the evaluation of clustering performance using a limited dataset. Despite the constraints of limited data, the clustering algorithm demonstrates promising results.

### Evaluation Metrics:

1. **Purity (0.55)**:
   - Purity measures the homogeneity of clusters regarding class membership.
   - A score of 0.55 indicates moderate purity, suggesting some mixing of classes within clusters.

2. **Silhouette Score (0.6806)**:
   - Reflects the separation and compactness of clusters.
   - A value of 0.6806 indicates well-separated and compact clusters, despite the limited data.

3. **Random Index (0.625)**:
   - Measures the similarity between predicted and actual clusterings.
   - A score of 0.625 signifies significant agreement between the predicted and actual clusters.

### Overall Assessment:

- The clustering algorithm demonstrates promising performance given the limited data.
- Despite moderate purity, the Silhouette Score and Random Index indicate well-separated clusters with substantial agreement between predicted and actual clusters.
- These results suggest the algorithm's ability to effectively identify patterns and group data points, even with a smaller dataset.

### Next Steps:

- Further data collection or augmentation could potentially improve cluster homogeneity (purity).
- Experimentation with different clustering algorithms or parameter tuning may enhance performance further.

### Dependencies:

- Data (limited dataset)


In [208]:
print("Purity:", compute_purity(true_labels, cluster_labels))
print("Silhouette Score:", silhouette)
print("Random Index:", compute_random_index(true_labels, cluster_labels))

Purity: 0.55
Silhouette Score: 0.6806129558015834
Random Index: 0.625


In [209]:
weight_word = pd.read_csv('weight.csv')

In [210]:
weight_word.shape

(20, 16661)

# ADHOC PROCESSING

## TEST DOC ~ Cleaning and Preprocessing Pipeline

This pipeline processes text data for natural language processing tasks. It includes several functions to clean and preprocess documents, including tokenization, removing conjunctions, word segmentation, and stemming.

### Functions:

1. **cleaning_pipeline(token_)**: Cleans tokens by removing punctuation, splitting words, and categorizing numbers separately.
   
2. **token_seperator(token_)**: Separates tokens into individual words using an inferred spaces approach.

3. **infer_spaces(s_, maxword_, wordcost_)**: Infers spaces between words in a tokenized string.

4. **remove_conjunctions_from_sets(input_list_)**: Removes conjunctions from sets of words.

5. **word_segment(sentences_)**: Segments sentences into words, handling cases of len(3) words and stopwords.

6. **preprocess_document(new_document_)**: Integrates all cleaning and preprocessing steps to process a document. Returns a list of stemmed terms.


In [211]:
def cleaning_pipeline_(token_):
    tokens_no_further_processing_required_ = []
    token_contain_only_numbers_ = []
    
    for x_ in token_:
        if len(x_) >= 4 and not x_.isdigit() and any(c_.isalpha() for c_ in x_):
            new_x_ = ''.join(' ' if char_ in punctuation else char_ for char_ in x_)
            if ' ' in new_x_:
                token_split_ = new_x_.split(' ')
                for word_ in token_split_:
                    if len(word_) >= 4:
                        tokens_no_further_processing_required_.append(word_)
            else:
                if len(new_x_) >= 4:
                    tokens_no_further_processing_required_.append(new_x_)
        elif x_.isdigit():
            token_contain_only_numbers_.append(x)
    
    return tokens_no_further_processing_required_, [], token_contain_only_numbers_



def token_seperator_(token_):
    words_ = list(english_words)

    wordcost_ = {k: log((i + 1) * log(len(words_))) for i, k in enumerate(words_)}
    maxword_ = max(len(x_) for x_ in words_)
    clean_term_ = []
    for i_ in range(len(token_)):
        clean_term_.append((infer_spaces(token_[i_][0].lower(), maxword_, wordcost_)))
    return cleaning_pipeline(clean_term_)
    

def infer_spaces_(s_, maxword_, wordcost_):
    def best_match(i_):
        candidates = enumerate(reversed(cost[max(0, i_ - maxword_):i_]))
        return min((c + wordcost_.get(s_[i_ - k_ - 1:i_], 9e999), k_ + 1) for k_, c in candidates)
    
    cost = [0]
    for i_ in range(1, len(s_) + 1):
        c, k_ = best_match(i_)
        cost.append(c)
    
    out = []
    i_ = len(s_)
    while i_ > 0:
        c, k_ = best_match(i_)
        assert c == cost[i_]
        out.append(s_[i_ - k_:i_])
        i_ -= k_
    
    return " ".join(reversed(out))
    
#######################################

def remove_conjunctions_from_sets_(input_list_):
    conjunctions_ = ['this', 'that','of', 'and', 'or', 'but', 'for', 'nor', 'so', 'yet', 'to', 'with', 'in', 'on', 'at', 'by', 'is', "the", "of", "all", "and", "to", "can", "be", "as", "once", "for", "at", "am", "are", "has", "have", "had", "up", "his", "her", "in", "on", "no"]
    filtered_list_ = []
    for s_ in input_list_:  # Unpack the tuple
        # Check from the beginning
        for i_ in range(len(s_)):
            first_word_ = s_[:i_].lower()
            if first_word_ in conjunctions_:
                s_ = s_[i_:]
                break
        # Check from the end
        for i_ in range(len(s_), 0, -1):
            last_word_ = s_[i_:].lower()
            if last_word_ in conjunctions_:
                s_ = s_[:i_]
                break
        filtered_list_.append(s_)  # Append the modified tuple
    return filtered_list_


def word_segment_(sentences_):
    len_3_word_ = []
    segmented_sentences_ = []
    term_ = []
    for sentence_ in sentences_:
        words_ = []
        start_ = 0
        while start_ < len(sentence_):
            found_ = False
            for end_ in range(len(sentence_), start_, -1):
                word_ = sentence_[start_:end_]
                if word_.lower() in english_words:
                    if word_.lower().startswith("the"):
                        word_ = word_[2:]
                    if len(word_) >= 4:
                        if word_.lower() not in {"a", "an", "the"} or start_ != 0:
                            words_.append(word_)
                        start_ = end_
                        found_ = True
                    else:
                        len_3_word_.append(word_)
                    break
            if not found_:
                start_ += 1
        segmented_sentences_.append(words_)

    term_ = [wd.lower() for wd_ in len_3_word_ if len(wd_) >= 3 and wordnet.synsets(word_, pos=wordnet.NOUN) and word_.lower() not in stop_words and word_ not in punctuation]
    for x_ in segmented_sentences_:
        for word_ in x_:
            if word_ not in stop_words and word_ in english_words:
                term_.append(word_)
    return term_

In [212]:
from nltk.stem import PorterStemmer
from nltk.corpus import words
import re

# Initialize Porter Stemmer
porter = PorterStemmer()

def preprocess_document(new_document_):
    
    tokens_ = []
    token_exist_in_vocab_ = []
    tokens_no_further_processing_required_ = []
    clean_token_ = []
    token_contain_only_numbers_ = []
    tokens_no_further_processing_required1_ = []
    clean_token1_ = []
    token_contain_only_numbers1_ = []
    tokens_no_further_processing_required2_ = []
    clean_token2_ = []
    token_contain_only_numbers2_ = []
    term_ = []
    stem_terms_ = []
    
    # Tokenization
    tokens_ = re.findall(r'\b\w+\b', new_document_.lower())
    
    # Cleaning pipeline
    token_have_hyphen_ = [x for x in tokens_]
    token_link_ = [x for x in token_have_hyphen_ if x.startswith('//') or x.startswith('www') or x.startswith('org')]
    token_have_hyphen_ = [x for x in token_have_hyphen_ if not (x.startswith('//') or x.startswith('www') or x.startswith('org'))]

    hyphen_term_ = [x for x in token_have_hyphen_ if x.count('-') == 1 and '-' not in [x[0], x[-1]] and 4 <= len(x.split('-')[0]) + len(x.split('-')[1]) <= 20]
    token_have_hyphen_ = [x for x in token_have_hyphen_ if (x,) not in hyphen_term_]

    token_link_ = [x for x in token_have_hyphen_ if x.startswith('//') or x.startswith('www') or x.startswith('org')]
    token_have_hyphen_ = [x for x in token_have_hyphen_ if not (x.startswith('//') or x.startswith('www') or x.startswith('org'))]

    token_exist_in_vocab_ = [x for x in token_have_hyphen_ if x in words.words()]
    token_have_hyphen_ = [x for x in token_have_hyphen_ if x not in words.words()]

    tokens_no_further_processing_required_, clean_token_, token_contain_only_numbers_ = cleaning_pipeline_(token_have_hyphen_)
    all_number_ = token_contain_only_numbers_
    tokens_no_further_processing_required1_, clean_token1_, token_contain_only_numbers1_ = cleaning_pipeline_(hyphen_term_)
    all_number_ += token_contain_only_numbers1_

    # Token separation and further processing
    tokens_no_further_processing_required2_, clean_token2_, token_contain_only_numbers2_ = token_seperator_(clean_token_)
    term_ = token_exist_in_vocab_ + tokens_no_further_processing_required_ + tokens_no_further_processing_required1_ + tokens_no_further_processing_required2_
    
    # Stemming
    stem_terms_ = []
    for x in term_:
        stemmed_x = porter.stem(x)
        stem_terms_.append(stemmed_x)
    
    return stem_terms_



## Text Classification using TF-IDF and KNN

This script classifies text documents into predefined categories using TF-IDF (Term Frequency-Inverse Document Frequency) feature extraction and K-Nearest Neighbors (KNN) classification.

### Steps:

1. **Preprocessing**:
   - The `preprocess_document` function tokenizes and preprocesses the input text document, producing a list of stemmed terms.
   
2. **TF-IDF Transformation**:
   - The script calculates the TF-IDF values for each term in the document using a pre-trained TF-IDF weight matrix (`weight_word`).
   
3. **KNN Classification**:
   - For each term in the document, if it exists in the TF-IDF weight matrix, the corresponding TF-IDF value is retrieved. Otherwise, a default value of 0 is assigned.
   - The TF-IDF values are padded with zeros to match the required input shape for classification.
   - The KNN classifier (`knn`) predicts the category of the document based onional details or instructions as needed!

In [213]:
new_document = ''' explainability in Artificial Intelligence (AI) has been revived as a topic ofactive research by the need of conveying safety and trust to users in the
“how”and “why”of automated decision-making in different applications '''

tokens = preprocess_document(new_document)

tfidf_values = []

for token in tokens:
    if token in weight_word.columns:
        tfidf_value = weight_word[token].iloc[0] 
        tfidf_values.append(tfidf_value)
    else:
        tfidf_values.append(0)  

tfidf_values += [0] * (1000 - len(tfidf_values))
tfidf_array = np.array(tfidf_values).reshape(1, -1) 
predicted_category = knn.predict(tfidf_array)

print("Predicted category:", predicted_category)

Predicted category: [3]
