<a href="https://colab.research.google.com/github/yunssup/Business_Text_Mining/blob/main/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'; preprocess() below calls it on each sentence.
# NOTE(review): the pipeline package presumably has to be installed beforehand — confirm for Colab.
nlp = spacy.load('en_core_web_sm')

In [1]:
# Toy corpus: three short sentences, each treated as one document in the cells below.
corpus = [
    'The planet, Neptune, is the furthest planet from the sun',
    'Jupiter is the largest planet',
    'Mars is the fourth planet from the sun'
]

In [4]:
def preprocess(doc):
    """Return the lemmas of the content tokens of `doc`, joined by single spaces.

    The text is passed through the notebook-level pipeline `nlp`. Tokens
    flagged as punctuation (`is_punct`) or as stop words (`is_stop`) are
    dropped; every remaining token contributes its `lemma_`.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(doc)
        if not (tok.is_punct or tok.is_stop)
    ]
    return " ".join(kept_lemmas)

In [7]:
# Run every raw sentence through preprocess(), keeping the corpus order.
corpus_preprocessed = list(map(preprocess, corpus))
print(corpus_preprocessed)

['planet Neptune furth planet sun', 'Jupiter large planet', 'Mars fourth planet sun']


## <font color="blue"> 1. Bag of Words (BOW, a.k.a. Term Frequency) </font>

## <font color="green"> Scikit-learn's CountVectorizer Methods and Attributes </font>

<table>
<tr><td><code>fit(raw_documents[, y])</code></td><td>Learn a vocabulary dictionary of all tokens in the raw documents.</td></tr>
<tr><td><code>transform(raw_documents)</code></td><td>Transform documents to a document-term matrix.</td></tr>
<tr><td><code>fit_transform(raw_documents[, y])</code></td><td>Learn the vocabulary dictionary and return the document-term matrix.</td></tr>
<tr><td><code>get_feature_names_out([input_features])</code></td><td>Get output feature names for transformation.</td></tr>
<tr><td><code>vocabulary_</code></td><td>A dictionary (attribute, not a method) where keys are terms and values are indices in the feature matrix.</td></tr>
</table>

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer created with all-default settings.
cv = CountVectorizer()


In [11]:
# Learn the vocabulary from the preprocessed sentences and build the document-term count matrix.
corpus_cv = cv.fit_transform(corpus_preprocessed)
# Bare expression: shows the sparse-matrix summary rather than the values.
corpus_cv

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [14]:
# Convert the sparse counts to a dense array so the full matrix is printed (one row per document).
print(corpus_cv.toarray())

[[0 1 0 0 0 1 2 1]
 [0 0 1 1 0 0 1 0]
 [1 0 0 0 1 0 1 1]]


In [15]:
# Terms learned by the vectorizer; they are used as the column labels of the count matrix below.
cv.get_feature_names_out()

array(['fourth', 'furth', 'jupiter', 'large', 'mars', 'neptune', 'planet',
       'sun'], dtype=object)

In [18]:
# Tabulate the counts: one row per document, one column per vocabulary term.
import pandas as pd

df_bow = pd.DataFrame(
    data=corpus_cv.toarray(),
    columns=cv.get_feature_names_out(),
)
print(df_bow)

   fourth  furth  jupiter  large  mars  neptune  planet  sun
0       0      1        0      0     0        1       2    1
1       0      0        1      1     0        0       1    0
2       1      0        0      0     1        0       1    1


In [19]:
# Put terms on the rows and documents on the columns.
# The frame is rebuilt from the count matrix instead of flipping df_bow in
# place, so re-running this cell cannot toggle the orientation back.
df_bow = pd.DataFrame(corpus_cv.toarray(), columns = cv.get_feature_names_out()).transpose()
print(df_bow)

         0  1  2
fourth   0  0  1
furth    1  0  0
jupiter  0  1  0
large    0  1  0
mars     0  0  1
neptune  1  0  0
planet   2  1  1
sun      1  0  1


In [21]:
# Label the document columns BOW-Doc1, BOW-Doc2, ... (numbering starts at 1).
df_bow.columns = [
    'BOW-Doc{}'.format(doc_no)
    for doc_no in range(1, len(corpus_preprocessed) + 1)
]
print(df_bow)

         BOW-Doc1  BOW-Doc2  BOW-Doc3
fourth          0         0         1
furth           1         0         0
jupiter         0         1         0
large           0         1         0
mars            0         0         1
neptune         1         0         0
planet          2         1         1
sun             1         0         1


---
## <font color=blue> 2. TF-IDF </font>

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer created with all-default settings.
vectorizer = TfidfVectorizer()

In [29]:
# Fit on the same preprocessed sentences used for the BOW matrix and compute their TF-IDF weights.
corpus_v = vectorizer.fit_transform(corpus_preprocessed)
# Bare expression: shows the sparse-matrix summary rather than the values.
corpus_v

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [49]:
# Dense copy of the TF-IDF weights (one row per document, one column per term).
tfidf_score = corpus_v.toarray()
print(tfidf_score)

[[0.         0.50165133 0.         0.         0.         0.50165133
  0.59256672 0.38151877]
 [0.         0.         0.65249088 0.65249088 0.         0.
  0.38537163 0.        ]
 [0.5844829  0.         0.         0.         0.5844829  0.
  0.34520502 0.44451431]]


In [50]:
import pandas as pd

# One row per document, one column per term; cells hold the TF-IDF weights.
df_tfidf = pd.DataFrame(
    data=tfidf_score,
    columns=vectorizer.get_feature_names_out(),
)
print(df_tfidf)

     fourth     furth   jupiter     large      mars   neptune    planet  \
0  0.000000  0.501651  0.000000  0.000000  0.000000  0.501651  0.592567   
1  0.000000  0.000000  0.652491  0.652491  0.000000  0.000000  0.385372   
2  0.584483  0.000000  0.000000  0.000000  0.584483  0.000000  0.345205   

        sun  
0  0.381519  
1  0.000000  
2  0.444514  


In [51]:
# Put terms on the rows and documents on the columns.
# The frame is rebuilt from tfidf_score instead of flipping df_tfidf in place,
# so re-running this cell cannot toggle the orientation back.
df_tfidf = pd.DataFrame(tfidf_score, columns = vectorizer.get_feature_names_out()).transpose()
print(df_tfidf)

                0         1         2
fourth   0.000000  0.000000  0.584483
furth    0.501651  0.000000  0.000000
jupiter  0.000000  0.652491  0.000000
large    0.000000  0.652491  0.000000
mars     0.000000  0.000000  0.584483
neptune  0.501651  0.000000  0.000000
planet   0.592567  0.385372  0.345205
sun      0.381519  0.000000  0.444514


In [58]:
# df_tfidf already has terms as rows and one column per document (it was
# transposed in the cell above), so only the column labels change here.
# Transposing a second time would put one column per term back, and assigning
# one label per document would then fail with a length-mismatch ValueError
# when the notebook is run top to bottom.
df_tfidf.columns = ['Tfidf-Doc{}'.format(i+1) for i in range(len(corpus_preprocessed))]
print(df_tfidf)

         Tfidf-Doc1  Tfidf-Doc2  Tfidf-Doc3
fourth     0.000000    0.000000    0.584483
furth      0.501651    0.000000    0.000000
jupiter    0.000000    0.652491    0.000000
large      0.000000    0.652491    0.000000
mars       0.000000    0.000000    0.584483
neptune    0.501651    0.000000    0.000000
planet     0.592567    0.385372    0.345205
sun        0.381519    0.000000    0.444514


In [53]:
# Merge Dataframes for comparison: both frames are indexed by term, so the join
# puts each term's raw counts next to its TF-IDF weights.
# NOTE(review): the recorded output labels the TF-IDF columns 0/1/2, so it appears to
# have been produced before the 'Tfidf-Doc' rename — re-run the cells in order to refresh it.
df_merged = df_bow.join(df_tfidf)
print(df_merged)

         BOW-Doc1  BOW-Doc2  BOW-Doc3         0         1         2
fourth          0         0         1  0.000000  0.000000  0.584483
furth           1         0         0  0.501651  0.000000  0.000000
jupiter         0         1         0  0.000000  0.652491  0.000000
large           0         1         0  0.000000  0.652491  0.000000
mars            0         0         1  0.000000  0.000000  0.584483
neptune         1         0         0  0.501651  0.000000  0.000000
planet          2         1         1  0.592567  0.385372  0.345205
sun             1         0         1  0.381519  0.000000  0.444514


In [40]:
# Mapping from each term to its column index in the TF-IDF matrix.
vectorizer.vocabulary_

{'planet': 6,
 'neptune': 5,
 'furth': 1,
 'sun': 7,
 'jupiter': 2,
 'large': 3,
 'mars': 4,
 'fourth': 0}

In [46]:
# Get index number of term, i.e. the value stored for 'planet' in the vocabulary mapping.
# dict.get returns None (instead of raising) when the term is not in the vocabulary.
vectorizer.vocabulary_.get('planet')

6

In [47]:
# Get idf score of term.
# The column index is looked up by name rather than hard-coded as 6, so the
# cell stays correct if the corpus (and therefore the vocabulary order) changes.
# A missing term raises KeyError instead of silently indexing the wrong entry.
vectorizer.idf_[vectorizer.vocabulary_['planet']]

1.0

In [57]:
# Print every vocabulary term next to its IDF weight.
terms = vectorizer.get_feature_names_out()

# The feature names come back ordered by column index, so a term's position in
# `terms` is also its index into `idf_`.
for index, term in enumerate(terms):
    print(f'{term:{10}} {vectorizer.idf_[index]}')

fourth     1.6931471805599454
furth      1.6931471805599454
jupiter    1.6931471805599454
large      1.6931471805599454
mars       1.6931471805599454
neptune    1.6931471805599454
planet     1.0
sun        1.2876820724517808
