<!--  -->

#### Statistical Word Embedding
### **tf-idf**
This algorithm revolves around two concepts:
- importance of a word w in a document d
\begin{align}
TF(w,d) &= \frac{number \; of \; times \; a \; word \; w \; appears \; in \; a \; document \; d}{total \; number \; of \; words \; in \; document \; d}
\end{align}
        
- how special (rare) a word w is across the document corpus S
\begin{align}
IDF(w,S) &= \log\left(\frac{total \; number \; of \; documents \; in \; dataset \; S}{number \; of \; documents \; containing \; word \; w}\right)
\end{align}
+ Pseudo code:

*  calculate the TF and IDF of each word in a corpus
*  these act as vector embeddings to compare the relevance and importance of a word in the given corpus

+ Sources
* [Good Source](https://www.geeksforgeeks.org/machine-learning/understanding-tf-idf-term-frequency-inverse-document-frequency/)

## 1. My Own Implementation

In [1]:
import json
import math
import re
import string
from collections import Counter

import numpy as np

In [3]:
from extract_rewedata import extract_rewedata


In [4]:
df = extract_rewedata()
df

CSV file already exists, skipping conversion.
CSV file read successfully!
               filename                                           document
0        REWE-03-05.pdf  REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
1        REWE-05-05.pdf  REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
2        REWE-08-05.pdf  R E W E\n****** Etzelstr. ******\n****** 74076...
3        REWE-09-05.pdf  REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
4  REWE-eBon (1)(1).pdf  REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...


Unnamed: 0,filename,document
0,REWE-03-05.pdf,REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
1,REWE-05-05.pdf,REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
2,REWE-08-05.pdf,R E W E\n****** Etzelstr. ******\n****** 74076...
3,REWE-09-05.pdf,REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
4,REWE-eBon (1)(1).pdf,REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
5,REWE-eBon (1).pdf,REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
6,REWE-eBon (2)(1).pdf,REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
7,REWE-eBon (2).pdf,REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
8,REWE-eBon (3)(1).pdf,REWE Ströbel oHG\nFleiner Str. 26\n74072 Heilb...
9,REWE-eBon (3).pdf,R E W E\n****** Etzelstr. ******\n****** 74076...


### 1.1. Preprocessing Documents

In [5]:
def preprocessing_document(docu):
    r"""
    Tokenise a single document into lower-cased words.

    1. convert every <space> into <\n>
    2. lower-case the text
    3. divide the string at each <\n>
    4. drop the empty strings produced by repeated or trailing separators

    docu: raw text of one document (str)

    return: list of words of the single document
    """
    # NOTE: punctuation is not stripped, so tokens such as 'str.' keep it.
    normalised = docu.replace(' ', '\n').lower()
    # Empty tokens are not words: keeping them would inflate the document
    # length used as the TF denominator and give '' its own TF/IDF entry.
    return [token for token in normalised.split('\n') if token]


### 1.2. Calculate TF(Term Frequency)

In [6]:
def calculate_TF(docu_list, focus_word):
    """
    Term frequency of ``focus_word`` inside one tokenised document.

    docu_list: list of words of a single document
    focus_word: the word whose occurrences are counted

    return: tuple (count, tf) with count = number of occurrences and
            tf = count / len(docu_list); an empty document gives (0, 0.0)
    """
    if len(docu_list) == 0:
        # An empty document contains no words; avoid a ZeroDivisionError.
        return 0, 0.0

    word_count = sum(1 for word in docu_list if focus_word == word)
    return word_count, word_count / len(docu_list)


In [7]:
# Single Document Each Word TF calculation
docu_1_re_split = preprocessing_document(df.iloc[0, 1])
print(docu_1_re_split[:10])
# for word in docu_1_re_split:
#     print(word, calculate_TF(docu_1_re_split, word), '\n')

['rewe', 'ströbel', 'ohg', 'fleiner', 'str.', '26', '74072', 'heilbronn', 'uid', 'nr.:']


In [8]:
def per_docu_TF(corpus):
    """
    Occurrence count and term frequency of every distinct word, per document.

    corpus: DataFrame with the columns 'filename' and 'document'

    return: dict {filename: {word: [count, tf]}} where
            tf = count / number of words in that document; the words of a
            document are stored in order of first appearance
    """
    docu_dict = {}
    for filename, text in zip(corpus['filename'], corpus['document']):
        docu = preprocessing_document(text)
        # One counting pass per document instead of re-scanning the whole
        # document for every single token (duplicates included).
        docu_dict[filename] = {
            word: [count, count / len(docu)]
            for word, count in Counter(docu).items()
        }

    return docu_dict

In [14]:
from itertools import islice

# Preview: count and TF of the first five words of one receipt.
tf_of_receipt = per_docu_TF(df)["REWE-03-05.pdf"]
first_five_tf = dict(islice(tf_of_receipt.items(), 5))
print(json.dumps(first_five_tf, indent=2))

{
  "rewe": [
    5,
    0.04
  ],
  "str\u00f6bel": [
    1,
    0.008
  ],
  "ohg": [
    1,
    0.008
  ],
  "fleiner": [
    1,
    0.008
  ],
  "str.": [
    1,
    0.008
  ]
}


### 1.3. Calculate IDF(Inverse Document Frequency)

In [15]:
def word_in_docu(docu, word):
    """
    Tell whether ``word`` equals one of the tokens of ``docu``.

    docu: iterable of tokens of one document
    word: the token to look for

    return: True if at least one token equals ``word``, otherwise False
    """
    return any(word == token for token in docu)

def calculate_IDF(docu_corpus, word=None):
    """
    Document count and inverse document frequency of ``word``.

    docu_corpus: DataFrame with a 'document' column
    word: the word to look up in every document

    return: tuple (docu_count, idf) where docu_count is the number of
            documents containing the word and
            idf = ln(number of rows in docu_corpus / docu_count);
            idf is 0 when no document contains the word
    """
    docu_count = sum(
        1
        for text in docu_corpus['document']
        if word_in_docu(preprocessing_document(text), word)
    )

    if docu_count <= 0:
        return docu_count, 0
    return docu_count, math.log(docu_corpus.shape[0] / docu_count)

  
# Example: document count and IDF of the token '26'.
idf_of_26 = calculate_IDF(df, '26')
print(idf_of_26)

(20, 0.3364722366212129)


In [16]:
def per_docu_IDF(corpus):
    """
    Document count and IDF of every distinct word, grouped per document.

    corpus: DataFrame with the columns 'filename' and 'document'

    return: dict {filename: {word: [docu_count, idf]}} where docu_count is
            the number of documents containing the word and
            idf = ln(number of rows in corpus / docu_count); the words of a
            document are stored in order of first appearance
    """
    # Tokenise every document exactly once; the previous version repeated
    # this for the whole corpus on every single token occurrence.
    tokenised = [preprocessing_document(text) for text in corpus['document']]

    # Document frequency: in how many documents does each word occur?
    docu_frequency = Counter()
    for docu in tokenised:
        docu_frequency.update(set(docu))

    total_docus = corpus.shape[0]
    docu_dict = {}
    for filename, docu in zip(corpus['filename'], tokenised):
        # dict.fromkeys de-duplicates while keeping first-appearance order.
        docu_dict[filename] = {
            word: [
                docu_frequency[word],
                math.log(total_docus / docu_frequency[word]),
            ]
            for word in dict.fromkeys(docu)
        }

    return docu_dict

In [17]:
# IDF preview for one receipt only; for the whole corpus use
# json.dumps(per_docu_IDF(df), indent=2) instead.
idf_of_receipt = per_docu_IDF(df)["REWE-03-05.pdf"]
first_five_idf = dict(islice(idf_of_receipt.items(), 5))
print(json.dumps(first_five_idf, indent=2))

{
  "rewe": [
    28,
    0.0
  ],
  "str\u00f6bel": [
    20,
    0.3364722366212129
  ],
  "ohg": [
    20,
    0.3364722366212129
  ],
  "fleiner": [
    20,
    0.3364722366212129
  ],
  "str.": [
    20,
    0.3364722366212129
  ]
}


### 1.4. Calculate the tf_idf score

Finally, a high tf_idf score tells us that a word is important in this document and rare across the whole corpus.

In [18]:
def tf_idf_score(corpus=None):
    """
    tf-idf score of every distinct word of every document.

    corpus: DataFrame with the columns 'filename' and 'document';
            when omitted, the notebook-level ``df`` is used

    return: list with one entry per filename; each entry is the list of
            tf * idf values (rounded to 3 decimals) of that document's
            distinct words, in order of first appearance
    """
    if corpus is None:
        corpus = df

    corpus_tf = per_docu_TF(corpus)
    corpus_idf = per_docu_IDF(corpus)

    scores = []
    for filename, tf_by_word in corpus_tf.items():
        idf_by_word = corpus_idf[filename]
        # Pair TF and IDF through the word itself rather than through the
        # position inside two separately built dictionaries.
        scores.append([
            round(tf_entry[1] * idf_by_word[word][1], 3)
            for word, tf_entry in tf_by_word.items()
        ])

    return scores

# Scores of the first ten distinct words of the first document.
first_docu_scores = tf_idf_score()[0]
first_docu_scores[:10]


[0.0, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.0, 0.0, 0.0]

## 2. Validate with Scikit Learn

In [19]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [20]:
# merge all docus
string = []
for each in df['document']:
    string.append(each)

string[:5]




In [21]:
# create object
tfidf = TfidfVectorizer()

# get tf-df values
result = tfidf.fit_transform(string)

In [22]:

vectorizer = CountVectorizer()

tf_matrix = vectorizer.fit_transform(string)
words = vectorizer.get_feature_names_out()

# Raw occurrence count (not the normalised frequency) of one word in one
# document: here the token "26" in the second document (index 1).
doc_index = 1
word = "26"

# Column index of the word, read directly from the fitted vocabulary
# instead of scanning the feature-name array.
word_index = vectorizer.vocabulary_[word]

# Count stored at (document row, word column).
tf_value = tf_matrix[doc_index, word_index]

print(f'TF of "{word}" in document {doc_index} is:', tf_value)

TF of "26" in document 1 is: 3


In [23]:
# get idf values
print('\nidf values:')
for ele1, ele2 in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    if ele1=="26":
        print(ele1, ':', ele2)


idf values:
26 : 1.3227733922630511


In [24]:
# get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)



Word indexes:
{'rewe': 539, 'ströbel': 570, 'ohg': 510, 'fleiner': 383, 'str': 569, '26': 104, '74072': 226, 'heilbronn': 421, 'uid': 593, 'nr': 504, 'de294160442': 341, 'eur': 366, 'bio': 302, 'joghurt': 446, '39': 153, 'fusilli': 386, '49': 176, 'summe': 571, '88': 245, 'geg': 392, 'american': 282, 'express': 371, 'steuer': 566, 'netto': 496, 'brutto': 311, '63': 207, '25': 101, 'gesamtbetrag': 396, 'tse': 588, 'signatur': 559, 'r01clqa9ubtqlg': 531, 'crmwlk1bxla': 328, 'kbv9': 455, 'h4c0hwct6zyxya871pkkpt6oghlmdozrh': 413, 't4yivublyuahc': 573, 'vg': 599, 'n35owysuc': 491, 'hvv': 429, 'yb': 615, '8ldfncxvtqmhfysklpvyxcvijvn': 250, 'signaturzähler': 560, '3501987': 141, 'transaktion': 585, '1685570': 51, 'start': 565, '2025': 71, '05': 13, '03t18': 9, '44': 168, '27': 108, '000': 1, 'stop': 568, '42': 162, 'seriennnummer': 557, 'kasse': 454, '74': 225, '56': 192, '3c': 157, 'd7': 334, 'a9': 273, '85': 243, '00': 0, '03': 7, '18': 62, 'bon': 308, '6040': 202, 'markt': 478, '5101': 18

In [None]:
# display tf-idf values
print('\ntf-idf value:')
print(result)



tf-idf value:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 112 stored elements and shape (1, 634)>
  Coords	Values
  (0, 539)	0.2299405601322187
  (0, 570)	0.043451322106423
  (0, 510)	0.043451322106423
  (0, 383)	0.043451322106423
  (0, 569)	0.043451322106423
  (0, 104)	0.043451322106423
  (0, 226)	0.043451322106423
  (0, 421)	0.03284865144745981
  (0, 593)	0.03284865144745981
  (0, 504)	0.06569730289491962
  (0, 341)	0.043451322106423
  (0, 366)	0.13139460578983925
  (0, 302)	0.10476795588631571
  (0, 446)	0.06469215246350515
  (0, 153)	0.075152928079161
  (0, 386)	0.10737184634081555
  (0, 176)	0.05677030411677471
  (0, 571)	0.03284865144745981
  (0, 245)	0.48276331340466905
  (0, 392)	0.03284865144745981
  (0, 282)	0.03284865144745981
  (0, 371)	0.03284865144745981
  (0, 566)	0.06569730289491962
  (0, 496)	0.03284865144745981
  (0, 311)	0.03284865144745981
  :	:
  (0, 326)	0.03284865144745981
  (0, 607)	0.03284865144745981
  (0, 604)	0.03284865144745981
  (0, 399)

In [30]:
# in matrix form
print('\ntf-idf values in matrix form:')
print(result.toarray()[:5])


tf-idf values in matrix form:
[[0.03284865 0.0656973  0.         ... 0.         0.         0.        ]
 [0.03706439 0.07412878 0.         ... 0.         0.         0.        ]
 [0.03069027 0.06138054 0.         ... 0.         0.         0.        ]
 [0.03516989 0.07033978 0.         ... 0.         0.         0.        ]
 [0.03528713 0.07057427 0.         ... 0.         0.         0.        ]]
