Merge pull request #232 from anuragkumarak95/master

geekcomputers · web-flow · commit 00c291f94861 · 2017-09-30T09:12:47.000+01:00
Tf-IDF Generator module and Sierpinski triangle fractal draw
diff --git a/sierpinski_triangle.py b/sierpinski_triangle.py
@@ -0,0 +1,64 @@
+'''Author Anurag Kumar | anuragkumarak95@gmail.com | git/anuragkumarak95
+
+Simple example of Fractal generation using recursive function.
+
+What is Sierpinski Triangle?
+>>The Sierpinski triangle (in the original orthography, Sierpiński), also called the Sierpinski gasket or the Sierpinski sieve,
+is a fractal and an attractive fixed set with the overall shape of an equilateral triangle, subdivided recursively into smaller
+equilateral triangles. Originally constructed as a curve, this is one of the basic examples of self-similar sets, i.e.,
+it is a mathematically generated pattern that is reproducible at any magnification or reduction. It is named after
+the Polish mathematician Wacław Sierpiński, but appeared as a decorative pattern many centuries prior to the work of Sierpiński.
+
+Requirements:
+  - turtle (ships with the Python standard library and needs Tk; no pip install required)
+
+Python:
+  - 2.6
+
+Usage:
+  - $python sierpinski_triangle.py <int:depth_for_fractal>
+
+Credits: This code was written by editing the code from http://www.lpb-riannetrujillo.com/blog/python-fractal/
+
+'''
+import turtle
+import sys
+PROGNAME = 'Sierpinski Triangle'
+# exactly one command-line argument, the recursion depth, is expected.
+if len(sys.argv) != 2:
+    raise Exception('right format for using this script: $python sierpinski_triangle.py <int:depth_for_fractal>')
+myPen = turtle.Turtle() # the single pen every triangle() call draws with
+myPen.ht() # hide the turtle cursor, only the drawn lines are shown
+myPen.speed(5)
+myPen.pencolor('red')
+
+points = [[-175,-125],[0,175],[175,-125]] # corners of the outermost triangle
+
+def getMid(p1,p2):  # midpoint of the segment p1-p2, returned as an (x, y) tuple
+    return tuple((p1[axis] + p2[axis]) / 2 for axis in (0, 1))
+
+def triangle(points,depth):
+    '''Draw the outline of `points`, then recurse into its three corner sub-triangles.
+
+    points : the three corners of the triangle, each indexable as (x, y).
+    depth : recursion levels left; sub-triangles are drawn only while depth > 0.
+    Draws with the module-level pen `myPen`; returns nothing.
+    '''
+    # trace the outline: jump to the first corner with the pen up, then visit the rest and close.
+    myPen.up()
+    myPen.goto(points[0][0],points[0][1])
+    myPen.down()
+    for corner in (1, 2, 0):
+        myPen.goto(points[corner][0],points[corner][1])
+
+    if not depth > 0:
+        return
+    a, b, c = points[0], points[1], points[2]
+    # every sub-triangle keeps one corner plus the midpoints of the two sides that meet there.
+    for sub_triangle in ([a, getMid(a, b), getMid(a, c)],
+                         [b, getMid(a, b), getMid(b, c)],
+                         [c, getMid(c, b), getMid(a, c)]):
+        triangle(sub_triangle, depth-1)
+
+triangle(points,int(sys.argv[1])) # depth comes from the single command-line argument
+turtle.mainloop() # enter the Tk event loop so the finished drawing stays up until the window is closed
diff --git a/tf_idf_generator.py b/tf_idf_generator.py
@@ -0,0 +1,128 @@
+'''@Author: Anurag Kumar(mailto:anuragkumarak95@gmail.com) 
+This module generates TF-IDF values (and, optionally, a TF-IDF dump file) from a list of files that contain docs.
+
+What is TF-IDF : https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+
+python:
+  - 3.5
+
+pre-requisites: 
+  - colorama==0.3.9 
+
+sample file format of input:
+
+    ##START(NOT INCLUDED)
+    sport smile today because signs Gemini
+    little sister dealt severe allergy figure
+    about looks gender color attitude nationality respect
+    added video playlist Sonic Fightstick Edition
+    weeks birthday scott wants camping keeper
+    photo taking photo trying auction scale photo
+    happy creatively capture story stage magical
+    yoongi looks seokjin looking yoongi looking seokjin
+    taking glasses because buffering cannot handle
+    tried Michelle Obama proceeded defend whole pointless
+    robbed shades backstage reading guess karma stealing
+    remains sailors destroyer McCain collision found
+    timeline beginnings infographics Catch upcoming debut
+    ##END(NOT INCLUDED)
+
+here, every line represents a document.
+
+have fun, cheers.
+'''
+import os, math, pickle
+from colorama import Fore, Style
+
+switcher = dict(  # colour code -> colorama foreground colour constant
+    r=Fore.RED,
+    bk=Fore.BLACK,
+    b=Fore.BLUE,
+    g=Fore.GREEN,
+    y=Fore.YELLOW,
+    m=Fore.MAGENTA,
+    c=Fore.CYAN,
+    w=Fore.WHITE,
+)
+def paint(str,color='r'):
+    '''Wrap a string in a foreground-colour code for colourful console logs.
+
+    @args:
+    --
+    str : the text to colour.
+    color : key of `switcher` selecting the colour. default is 'r'=RED
+
+    @returns:
+    --
+    str : the text preceded by the colour code and followed by Style.RESET_ALL,
+          or the text unchanged when `color` is not a key of `switcher`.
+
+    '''
+    # unknown colour codes fall through and leave the text untouched
+    return switcher[color]+str+Style.RESET_ALL if color in switcher else str
+
+TAG = paint('TF-IDF-GENE/', color='b')  # blue prefix printed in front of this module's console messages
+def find_tf_idf(file_names=['./../test/testdata'],prev_file_path=None, dump_path=None):
+    '''Build TF-IDF dictionaries for a corpus in which every line of every input file is one document.
+
+    @Args:
+    --
+    file_names : paths of the text files to process, you can give many small sized files rather than one large file.
+    prev_file_path : path of an old .tfidfpkl dump whose idf/tf_idf are loaded and extended, if available. (default=None)
+    dump_path : file path ending in 'tfidfpkl' where the (idf, tf_idf) tuple is pickled. (default=None)
+
+    @returns:
+    --
+    idf : a dict of unique words in corpus,with their document frequency as values.
+    tf_idf : one dict per doc, word -> (occurrences / distinct words in the doc) * ln(number of docs / document frequency).
+
+    @raises:
+    --
+    Exception : when dump_path is given but does not end in 'tfidfpkl'; checked before any file is read.
+    '''
+    # fail fast: reject a bad dump path before loading, reading or computing anything.
+    if dump_path and dump_path[-8:] != 'tfidfpkl':
+        raise Exception(TAG+"Please provide a .tfidfpkl file_path, it is the standard format of this module.")
+
+    tf_idf = [] # one dict per doc (a doc is a line of an input file)
+    idf = {} # word -> number of docs that contain it
+    prev_word_count = prev_corpus_length = 0
+
+    if prev_file_path:
+        print(TAG,'modifying over exising file.. @',prev_file_path)
+        # SECURITY: unpickling can execute arbitrary code, only pass dump files you trust.
+        with open(prev_file_path,'rb') as prev_file:
+            idf,tf_idf = pickle.load(prev_file)
+        # NOTE(review): dumped docs already hold tf-idf weights, not raw counts, yet the
+        # weighting loop below processes them again as if they were counts.
+        prev_word_count, prev_corpus_length = len(idf), len(tf_idf)
+
+    for f in file_names:
+        # text mode on purpose: 'rb' would produce bytes tokens such as b'word'
+        with open(f,'r') as docs:
+            for line in docs:
+                words = line.split()
+                # document frequency: every distinct word counts once per doc
+                for word in set(words):
+                    idf[word] = idf.get(word, 0) + 1
+                word_count = {} # raw occurrences of every word inside this doc
+                for word in words:
+                    word_count[word] = word_count.get(word, 0) + 1
+                tf_idf.append(word_count)
+
+    # turn the per-doc counts into TF-IDF weights, in place
+    corpus_length = len(tf_idf)
+    for doc in tf_idf:
+        distinct_words = len(doc)
+        for key in doc:
+            doc[key] = (doc[key]/distinct_words) * math.log(corpus_length/idf[key])
+
+    # log the totals, plus the growth relative to the previous dump when one was loaded
+    print(TAG,'Total number of unique words in corpus',len(idf),'( '+paint('++'+str(len(idf)-prev_word_count),'g')+' )' if prev_file_path else '')
+    print(TAG,'Total number of docs in corpus:',len(tf_idf),'( '+paint('++'+str(len(tf_idf)-prev_corpus_length),'g')+' )' if prev_file_path else '')
+    if dump_path:
+        with open(dump_path,'wb') as dump_file:
+            pickle.dump((idf,tf_idf),dump_file,protocol=pickle.HIGHEST_PROTOCOL)
+        print(TAG,'Dumping TF-IDF vars @',dump_path)
+    return idf,tf_idf
+