Skip to content

Commit 8b0cd03

Browse files
added tf_idf_generator.py
1 parent 6311956 commit 8b0cd03

File tree

1 file changed

+125
-0
lines changed

1 file changed

+125
-0
lines changed

tf_idf_generator.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
'''@Author: Anurag Kumar(mailto:anuragkumarak95@gmail.com)
2+
This module generates TF-IDF values (optionally dumped to a file) from a list of files that contain docs.
3+
4+
python:
5+
- 3.5
6+
7+
pre-requisites:
8+
- colorama==0.3.9
9+
10+
sample file format of input:
11+
12+
##START(NOT INCLUDED)
13+
sport smile today because signs Gemini
14+
little sister dealt severe allergy figure
15+
about looks gender color attitude nationality respect
16+
added video playlist Sonic Fightstick Edition
17+
weeks birthday scott wants camping keeper
18+
photo taking photo trying auction scale photo
19+
happy creatively capture story stage magical
20+
yoongi looks seokjin looking yoongi looking seokjin
21+
taking glasses because buffering cannot handle
22+
tried Michelle Obama proceeded defend whole pointless
23+
robbed shades backstage reading guess karma stealing
24+
remains sailors destroyer McCain collision found
25+
timeline beginnings infographics Catch upcoming debut
26+
##END(NOT INCLUDED)
27+
28+
here, every line represents a document.
29+
'''
30+
import os, math, pickle
31+
from colorama import Fore, Style
32+
import pickle
33+
34+
# Lookup table from the short color codes accepted by paint() to the
# corresponding colorama foreground escape sequences.
switcher = dict(
    r=Fore.RED,
    bk=Fore.BLACK,
    b=Fore.BLUE,
    g=Fore.GREEN,
    y=Fore.YELLOW,
    m=Fore.MAGENTA,
    c=Fore.CYAN,
    w=Fore.WHITE,
)
44+
def paint(str, color='r'):
    '''Wrap a string in a foreground color for colorful console logs.

    @args:
    --
    str : the text to colorize.
    color : key into `switcher` selecting the foreground color. default is 'r'=RED

    @returns:
    --
    str : the text prefixed with the color code and suffixed with
          Style.RESET_ALL; returned untouched when `color` is not a key
          of `switcher`.

    '''
    color_code = switcher.get(color)
    if color_code is None:
        # Unknown color code: hand the text back as-is.
        return str
    return color_code + str + Style.RESET_ALL
60+
61+
# Blue-colored prefix passed as the first argument of every print() call in find_tf_idf.
TAG = paint('TF-IDF-GENE/','b')
62+
def find_tf_idf(file_names=None, prev_file_path=None, dump_path=None):
    '''Function to create a TF-IDF list of dictionaries for a corpus of docs.

    Every line of every input file is treated as one doc (a blank line
    counts as an empty doc). The weight stored for a word in a doc is

        (occurrences of word in doc / number of distinct words in doc)
            * ln(total number of docs / number of docs containing word)

    NOTE(review): the term frequency is normalised by the number of
    *distinct* words in the doc, not by its total word count; this is kept
    as-is so existing results do not change.

    If you opt for dumping the data, provide a file path ending with the
    'tfidfpkl' extension. An existing dump can be extended with new docs by
    passing its path as `prev_file_path`.

    @Args:
    --
    file_names : iterable of paths of files to be processed; many small files
                 are preferable to one large file.
                 (default=None, which means ['./../test/testdata'])
    prev_file_path : path of an old .tfidfpkl file, if available. Its document
                     frequencies are carried over and updated; its already
                     weighted docs are kept unchanged. (default=None)
    dump_path : file path where the (idf, tf_idf) tuple is pickled. Nothing is
                written unless the path ends with 'tfidfpkl'. (default=None)

    @returns:
    --
    idf : a dict of unique words in corpus, with their document frequency as values.
    tf_idf : the generated tf-idf list of dictionaries (word -> weight), one per doc.
    '''
    # A None sentinel avoids sharing one mutable default list across calls.
    if file_names is None:
        file_names = ['./../test/testdata']

    tf_idf = []  # one dict per doc: word counts first, tf-idf weights at the end
    idf = {}     # word -> number of docs containing it
    prev_doc_count = 0      # unique words known before this run
    prev_corpus_length = 0  # docs known before this run

    # Extend an existing tf-idf dump with new docs (memory is the limiting factor here).
    if prev_file_path:
        print(TAG,'modifying over exising file.. @',prev_file_path)
        # SECURITY: pickle can execute arbitrary code while loading; only
        # pass paths of dumps that come from a trusted source.
        with open(prev_file_path, 'rb') as prev_file:
            idf, tf_idf = pickle.load(prev_file)
        prev_doc_count = len(idf)
        prev_corpus_length = len(tf_idf)

    for file_name in file_names:
        # Text mode on purpose: 'rb' would yield bytes keys such as b'word'.
        with open(file_name, 'r') as docs_file:
            for line in docs_file:
                # Count every word of this doc.
                word_count = {}
                for word in line.split():
                    word_count[word] = word_count.get(word, 0) + 1
                # Each distinct word adds one to its document frequency.
                for word in word_count:
                    idf[word] = idf.get(word, 0) + 1
                tf_idf.append(word_count)

    # Turn the raw counts into final TF-IDF values. Only docs read in this run
    # are converted: docs loaded from prev_file_path already hold weights, not
    # counts, so re-weighting them would corrupt their values.
    corpus_size = len(tf_idf)
    for doc in tf_idf[prev_corpus_length:]:
        distinct_words = len(doc)
        for key in doc:
            true_idf = math.log(corpus_size / idf[key])
            true_tf = doc[key] / distinct_words
            doc[key] = true_tf * true_idf

    # Log how much was processed, with the growth over the previous dump if any.
    if prev_file_path:
        words_delta = '( '+paint('++'+str(len(idf)-prev_doc_count),'g')+' )'
        docs_delta = '( '+paint('++'+str(len(tf_idf)-prev_corpus_length),'g')+' )'
    else:
        words_delta = ''
        docs_delta = ''
    print(TAG,'Total number of unique words in corpus',len(idf),words_delta)
    print(TAG,'Total number of docs in corpus:',len(tf_idf),docs_delta)

    # Dump only when a path with the expected extension is given; any other
    # path is silently ignored, as before.
    if dump_path and dump_path[-8:] == 'tfidfpkl':
        with open(dump_path, 'wb') as dump_file:
            pickle.dump((idf,tf_idf),dump_file,protocol=pickle.HIGHEST_PROTOCOL)
        print(TAG,'Dumping TF-IDF vars @',dump_path)
    return idf,tf_idf
125+

0 commit comments

Comments
 (0)