# TF-IDF(Term Frequency - Inverse Document Frequency)

$$TF\_IDF = TF * IDF$$
<br>
$$tf(word_i, doc_j) = \frac{num(word_i\_in\_doc_j)}{num(all\_words\_in\_doc_j)}$$
<br>
$$idf(word_i) = log(\frac{num(all\_docs)}{num(docs\_which\_include\_word_i)})$$

tf-idf(i, j): 文書jにおける単語iの重要性  
tf(i, j): 文書jの全単語に対する，単語iの占める割合  
idf(i): 全文書数を，単語iを含む文書数で割った値の対数（単語iを含む文書が少ないほど大きい）

In [3]:
import math
import pandas as pd
import math
import warnings 
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
def computeTF(word_dict, bow):
    """Compute the term frequency of every word of one document.

    Args:
        word_dict: Mapping of word -> number of occurrences in the document.
        bow: The document's list of tokens; only its length is used, as the
            denominator of each frequency.

    Returns:
        A dict mapping every key of ``word_dict`` to ``count / len(bow)``.
        When ``bow`` is empty, every frequency is 0.0.
    """
    bow_cnt = len(bow)
    if bow_cnt == 0:
        # An empty document has nothing to divide by; report zero frequency
        # instead of raising ZeroDivisionError.
        return {word: 0.0 for word in word_dict}
    return {word: cnt / bow_cnt for word, cnt in word_dict.items()}

def computeIDF(doc_dict):
    """Compute the inverse document frequency of every word.

    Args:
        doc_dict: A list with one mapping per document, each mapping
            word -> number of occurrences in that document. The mappings do
            not need to share the same keys.

    Returns:
        A dict mapping every word seen in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df`` is
        the number of documents in which the word's count is positive.
        A word whose count is zero in every document gets 0.0, and an empty
        ``doc_dict`` yields an empty dict.
    """
    N = len(doc_dict)

    # Document frequency over the union of all vocabularies, so a word that
    # is missing from the first document is still counted.
    doc_freq = {}
    for doc in doc_dict:
        for word, val in doc.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if val > 0 else 0)

    # df == 0 would divide by zero; such a word occurs nowhere, so its
    # tf is zero as well and a weight of 0.0 keeps the product at zero.
    return {
        word: math.log(N / df) if df else 0.0
        for word, df in doc_freq.items()
    }

def computeTFIDF(tf_bow, idfs):
    """Weight each term frequency by the matching inverse document frequency.

    Args:
        tf_bow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every word of ``tf_bow`` (a missing word raises KeyError).

    Returns:
        A dict mapping every word of ``tf_bow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tf_bow.items()}

In [5]:
def doc2TFIDF(docA, docB):
    """Print the TF-IDF table of two documents.

    Each document is tokenised by splitting on single spaces. The shared
    vocabulary is the set of tokens of both documents, and each document is
    counted against that whole vocabulary, so both rows have the same columns.

    Args:
        docA: Text of the first document.
        docB: Text of the second document.

    Returns:
        None. A two-row DataFrame (row 0 = docA, row 1 = docB) is printed.
    """
    tokens_a = docA.split(' ')
    tokens_b = docB.split(' ')
    vocabulary = set(tokens_a + tokens_b)

    # Start every vocabulary word at zero so that words absent from one
    # document still appear in its count table.
    counts_a = dict.fromkeys(vocabulary, 0)
    for token in tokens_a:
        counts_a[token] += 1
    counts_b = dict.fromkeys(vocabulary, 0)
    for token in tokens_b:
        counts_b[token] += 1

    idfs = computeIDF([counts_a, counts_b])
    rows = [
        computeTFIDF(computeTF(counts_a, tokens_a), idfs),
        computeTFIDF(computeTF(counts_b, tokens_b), idfs),
    ]
    print(pd.DataFrame(rows))

# 参考
[TF IDF | TFIDF Python Example](https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76)

In [36]:
import os

# Directory holding the raw corpus file and receiving the cleaned copy.
LI0630 = '../LI22/20220630'

# Collect every non-blank line with its surrounding whitespace removed.
result = []
with open(os.path.join(LI0630, "data_org.txt")) as f:
    for i, line in enumerate(f):
        if line.isspace():
            print('space line is detected !\n')
            continue
        # str.strip() without arguments removes leading and trailing
        # whitespace, which includes newlines (\n), tabs (\t) and
        # full-width spaces (\u3000).
        line = line.strip()
        result.append(line)

with open(os.path.join(LI0630, "result.txt"), "w") as f:
    for line in result:
        f.write(line + "\n")

# Report the number of lines kept. The loop index `i` is zero-based, also
# counts blank lines, and is undefined for an empty file, so it is not a
# reliable line count.
print(f'num lines in document == {len(result)}')

space line is detected !

num lines in document == 3


In [32]:

# Show the last ten entries of `bag_of_words` together with its total length.
bag_of_words[-10:], len(bag_of_words)

(['child',
  'would',
  'not',
  'be',
  'wearing',
  'shorts',
  'in',
  'below',
  'freezing',
  'weather'],
 10220)

In [25]:
def computeTF(word_dict, bow):
    """Compute the term frequency of every word of one document.

    Args:
        word_dict: Mapping of word -> number of occurrences in the document.
        bow: The document's list of tokens; only its length is used, as the
            denominator of each frequency.

    Returns:
        A dict mapping every key of ``word_dict`` to ``count / len(bow)``.
        When ``bow`` is empty, every frequency is 0.0.
    """
    bow_cnt = len(bow)
    if bow_cnt == 0:
        # An empty document has nothing to divide by; report zero frequency
        # instead of raising ZeroDivisionError.
        return {word: 0.0 for word in word_dict}
    return {word: cnt / bow_cnt for word, cnt in word_dict.items()}

# Term frequencies of one corpus file, followed by its five highest-scoring words.
path = '/Users/nakatani/Desktop/22前期/言語情報学/LI22/20220630/data1.txt'
with open(path) as f:
    doc = f.read()

# Lower-case every whitespace-separated token so that counting ignores case.
bag_of_words = list(map(str.lower, doc.split()))
unique = set(bag_of_words)
num_words = dict.fromkeys(unique, 0)
for word in bag_of_words:
    num_words[word] += 1

tf = computeTF(num_words, bag_of_words)

# [frequency, word] pairs compare by frequency first and by word on ties.
ranked = [[value, key] for key, value in tf.items()]
ranked.sort(reverse=True)
top5 = ranked[:5]
for i, (value, key) in enumerate(top5):
    print(f'{i}: {key} {value}')

0: the 0.07348336594911938
1: and 0.05342465753424658
2: of 0.04442270058708415
3: long 0.043835616438356165
4: mike 0.0213307240704501


[[751, 'the'], [546, 'and'], [454, 'of'], [448, 'long'], [218, 'mike']]

In [37]:
# Display the current value of `line`; it is assigned by the file-processing
# loops in other cells (presumably the most recently executed one).
line

'@@12747081 @1347081/ <h> Plural Possessives : Why You Put an Apostrophe After the S <p> Its common for people to wonder , " What does it mean to put an apostrophe after an S ? " It can get a bit confusing . To get it right , you need to understand what a possessive is . <p> Possessives are used to show ownership ; to show that something belongs to someone . " The presidents official airplane " is one example . The airplane belongs to the president , of course . But there is only one president ; after all , you do n\'t  have two presidents of the same country . So , this is a singular possessive made by adding an apostrophe and an " s . " <p> But with a phrase such as " the thieves jewels , " you have a plural noun : thieves . Most of the time , a plural noun will be formed by adding " s " to it . In this case , you also change the spelling but do n\'t  worry about that now . The main @ @ @ @ @ @ @ @ @ @ . " Houses , cats , clouds , essays , rainbows : these are all plural nouns . <p> 

In [38]:
import re

# Exploratory pass over `line`: print every token that is not an ordinary
# word, tagged with the number of the noise category it falls into.
#   1: markup tags such as <p> or <h>
#   2: tokens starting with '@'
#   3: tokens starting with 's (detached possessive / contraction)
#   4: tokens starting with a digit
# Tokens in none of these categories that do not start with an ASCII letter
# (punctuation, empty strings from repeated spaces, ...) are printed untagged.
result = []  # reset here; this cell only prints and does not fill it
words = line.split(' ')
for word in words:
    if re.match(r'<.*>', word):
        print('1: ', word)
    elif re.match(r'@.*', word):
        print('2: ', word)
    elif re.match(r"'s", word):
        print('3: ', word)
    elif re.match(r'\d+', word):
        # Raw string: '\d' in a normal string literal is an invalid escape.
        print('4: ', word)
    elif re.match(r'[a-zA-Z]+', word):
        # Ordinary word: nothing to report.
        pass
    else:
        print(word)

2:  @@12747081
2:  @1347081/
1:  <h>
:
1:  <p>
,
"
?
"
.
,
.
1:  <p>
;
.
"
"
.
,
.
;
,

.
,
"
.
"
1:  <p>
"
,
"
:
.
,
"
"
.
,

.
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
.
"
,
,
,
,
:
.
1:  <p>
,
(
)
"
.
"
're
,
.
3:  's
.
1:  <p>
,
,
,
,
.
1:  <h>
1:  <p>
(
;
're
)
1:  <p>
(
)
1:  <p>
(
)
1:  <p>
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
,
:
"
.
"
,
3:  's
.
1:  <p>
:
"
.
"
.
.
!
,
.
1:  <p>
.
"
!
"
,
?
.
.
"
?
"
.
,
.
"
.
"
"
"
,
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
've
,
're
.
3:  's
:
.
.
,
,
,
!
1:  <p>
.
"
"
.
"
"
.
,
,
"
.
"
.
1:  <p>
(
"
"
"
"
)
1:  <p>
(
"
"
"
"
)
1:  <p>
(
"
"
)
1:  <p>
,

.
1:  <h>
1:  <p>
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
,
3:  's
.
.
,
"
"
.
.
?
1:  <p>
.
1:  <p>
.
1:  <p>
.
?
.
.
.
.
1:  <p>
.
1:  <p>
.
1:  <p>
,
,
.
.
.
,
,
?
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
2:  @
.
"
1:  <p>
,
,
.
1:  <h>
4:  4
1:  <p>
,
.

.
.
!
1:  <p>
.
"
"
.
"
"
3:  's
.


In [35]:
import sys
import re
def remove_noise(word_line):
    """Keep only the tokens of ``word_line`` that start with an ASCII letter.

    The line is split on single spaces. Markup tags (``<p>``), ``@`` markers,
    detached ``'s``, numbers, punctuation and the empty strings produced by
    repeated spaces all start with a non-letter character, so this single
    test drops them. Tokens such as ``n't`` start with a letter and are kept.

    Args:
        word_line: One line of space-separated tokens.

    Returns:
        The kept tokens joined by single spaces (an empty string if none).
    """
    # re.match anchors at the start of the token, so one letter is enough to
    # decide; a raw string avoids invalid-escape warnings in the pattern.
    starts_with_letter = re.compile(r'[a-zA-Z]').match
    return ' '.join(
        word for word in word_line.split(' ') if starts_with_letter(word)
    )

# The input path is hard-coded here; the disabled alternative was to take it
# from the command line (sys.argv[1]).
path = '/Users/nakatani/Desktop/22前期/言語情報学/LI22/20220630/data_org.txt'

# Clean every non-blank line of the input with remove_noise().
result = []
with open(path) as f:
    for i, line in enumerate(f):
        if not line.isspace():
            result.append(remove_noise(line.strip()))
        else:
            print('space line is detected !\n')

# The output file name is the input path with 'data_org' replaced by 'result'.
out_path = path.replace('data_org', 'result')
with open(out_path, "w") as f:
    print(out_path)
    for line in result:
        f.write(f"{line}\n")

space line is detected !

/Users/nakatani/Desktop/22前期/言語情報学/LI22/20220630/result.txt
