|
| 1 | +'''@Author: Anurag Kumar(mailto:anuragkumarak95@gmail.com) |
| 2 | +This module is used for generating a TF-IDF file or values from a list of files that contain docs. |
| 3 | +
|
| 4 | +What is TF-IDF : https://en.wikipedia.org/wiki/Tf%E2%80%93idf |
| 5 | +
|
| 6 | +python: |
| 7 | + - 3.5 |
| 8 | +
|
| 9 | +pre-requisites: |
| 10 | + - colorama==0.3.9 |
| 11 | +
|
| 12 | +sample file format of input: |
| 13 | +
|
| 14 | + ##START(NOT INCLUDED) |
| 15 | + sport smile today because signs Gemini |
| 16 | + little sister dealt severe allergy figure |
| 17 | + about looks gender color attitude nationality respect |
| 18 | + added video playlist Sonic Fightstick Edition |
| 19 | + weeks birthday scott wants camping keeper |
| 20 | + photo taking photo trying auction scale photo |
| 21 | + happy creatively capture story stage magical |
| 22 | + yoongi looks seokjin looking yoongi looking seokjin |
| 23 | + taking glasses because buffering cannot handle |
| 24 | + tried Michelle Obama proceeded defend whole pointless |
| 25 | + robbed shades backstage reading guess karma stealing |
| 26 | + remains sailors destroyer McCain collision found |
| 27 | + timeline beginnings infographics Catch upcoming debut |
| 28 | + ##END(NOT INCLUDED) |
| 29 | +
|
| 30 | +here, every line represents a document. |
| 31 | +
|
| 32 | +have fun, cheers. |
| 33 | +''' |
| 34 | +import os, math, pickle |
| 35 | +from colorama import Fore, Style |
| 36 | + |
# Short colour codes accepted by paint(), mapped to colorama foreground colours.
switcher = dict(
    r=Fore.RED,
    bk=Fore.BLACK,
    b=Fore.BLUE,
    g=Fore.GREEN,
    y=Fore.YELLOW,
    m=Fore.MAGENTA,
    c=Fore.CYAN,
    w=Fore.WHITE,
)
def paint(str, color='r'):
    '''Wrap a string in a foreground colour for colourful console logs.

    @args:
    --
    str : text to colourise.
    color : key of `switcher` selecting the colour. default is 'r'=RED

    @returns:
    --
    str : the text prefixed with the selected colour and followed by
          Style.RESET_ALL, or the text unchanged when `color` is not a
          key of `switcher`.

    '''
    # Unknown colour codes are not an error: the text is passed through as-is.
    if color not in switcher:
        return str
    return switcher[color] + str + Style.RESET_ALL
| 63 | + |
# Blue-coloured prefix used in front of this module's console and error messages.
TAG = paint('TF-IDF-GENE/', color='b')
def find_tf_idf(file_names=('./../test/testdata',), prev_file_path=None, dump_path=None):
    '''Function to create a TF-IDF list of dictionaries for a corpus of docs.
    Every line of every input file is treated as one document.
    If you opt for dumping the data, you can provide a file_path with .tfidfpkl extension(standard made for better understanding)
    and also re-generate a new tfidf list which overrides over an old one by mentioning its path.

    @Args:
    --
    file_names : iterable of paths of files to be processed on, you can give many small sized file, rather than one large file.
    prev_file_path : path of old .tfidfpkl file, if available. (default=None)
    dump_path : file-path (ending in 'tfidfpkl') where to dump generated lists.(default=None)

    @returns:
    --
    idf : a dict of unique words in corpus,with their document frequency as values.
    tf_idf : the generated tf-idf list of dictionaries for mentioned docs.

    @raises:
    --
    Exception : when dump_path is given but does not end with 'tfidfpkl'.
    '''
    # Fail fast: reject a bad dump target before spending time on the corpus.
    if dump_path and not dump_path.endswith('tfidfpkl'):
        raise Exception(TAG+"Please provide a .tfidfpkl file_path, it is the standard format of this module.")

    tf_idf = []  # one dict per doc (a line of an input file): word -> count, later word -> weight
    idf = {}     # word -> number of docs containing it

    # Sizes of the previously dumped data, only used for the "++N" part of the log lines.
    prev_word_count = 0
    prev_corpus_length = 0

    # this statement is useful for altering existant tf-idf file and adding new docs in itself.(## memory is now the biggest issue)
    if prev_file_path:
        print(TAG,'modifying over exising file.. @',prev_file_path)
        # SECURITY: pickle can execute arbitrary code while loading; only pass
        # files that were produced by this module from a trusted location.
        with open(prev_file_path,'rb') as prev_file:
            idf,tf_idf = pickle.load(prev_file)
        prev_word_count = len(idf)
        prev_corpus_length = len(tf_idf)
        # NOTE(review): the loaded docs already hold final weights, not raw counts,
        # yet the weighting loop below processes them again as if they were counts.
        # Raw counts cannot be recovered from the dump format, so this is left
        # unchanged here -- confirm whether the dump should store counts instead.

    for file_name in file_names:
        # never use 'rb' for textual data, it creates something like, {b'line-inside-the-doc'}
        with open(file_name,'r') as corpus_file:
            for line in corpus_file:
                # count of every word inside this doc
                word_count = {}
                for word in line.split():
                    word_count[word] = word_count.get(word, 0) + 1
                # each distinct word of this doc raises its document frequency by one
                for word in word_count:
                    idf[word] = idf.get(word, 0) + 1
                tf_idf.append(word_count)

    #calculating final TF-IDF values for all words in all docs(line in a doc in this case)
    corpus_size = len(tf_idf)
    for doc in tf_idf:
        # NOTE(review): the term frequency is normalised by the number of DISTINCT
        # words in the doc, not by its total word count -- kept as in the original.
        distinct_words = len(doc)
        for key in doc:
            true_idf = math.log(corpus_size/idf[key])
            true_tf = doc[key]/distinct_words
            doc[key] = true_tf * true_idf

    # do not get overwhelmed, just for logging the quantity of words that have been processed.
    print(TAG,'Total number of unique words in corpus',len(idf),'( '+paint('++'+str(len(idf)-prev_word_count),'g')+' )' if prev_file_path else '')
    print(TAG,'Total number of docs in corpus:',len(tf_idf),'( '+paint('++'+str(len(tf_idf)-prev_corpus_length),'g')+' )' if prev_file_path else '')

    # dump if a path is given (its extension was validated at the top)
    if dump_path:
        with open(dump_path,'wb') as dump_file:
            pickle.dump((idf,tf_idf),dump_file,protocol=pickle.HIGHEST_PROTOCOL)
        print(TAG,'Dumping TF-IDF vars @',dump_path)
    return idf,tf_idf
| 128 | + |
0 commit comments