Skip to content

Commit 00c291f

Browse files
Merge pull request #232 from anuragkumarak95/master
Tf-IDF Generator module and Sierpinski triangle fractal draw
2 parents 62f78bf + 06167db commit 00c291f

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed

sierpinski_triangle.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
'''Author Anurag Kumar | anuragkumarak95@gmail.com | git/anuragkumarak95
2+
3+
Simple example of Fractal generation using recursive function.
4+
5+
What is Sierpinski Triangle?
6+
>>The Sierpinski triangle (also with the original orthography Sierpiński), also called the Sierpinski gasket or the Sierpinski Sieve,
7+
is a fractal and attractive fixed set with the overall shape of an equilateral triangle, subdivided recursively into smaller
8+
equilateral triangles. Originally constructed as a curve, this is one of the basic examples of self-similar sets, i.e.,
9+
it is a mathematically generated pattern that can be reproducible at any magnification or reduction. It is named after
10+
the Polish mathematician Wacław Sierpinski, but appeared as a decorative pattern many centuries prior to the work of Sierpinski.
11+
12+
Requirements(pip):
13+
- turtle (part of the Python standard library; no pip install is needed)
14+
15+
Python:
16+
- 2.6
17+
18+
Usage:
19+
- $python sierpinski_triangle.py <int:depth_for_fractal>
20+
21+
Credits: This code was written by editing the code from http://www.lpb-riannetrujillo.com/blog/python-fractal/
22+
23+
'''
24+
import turtle
25+
import sys
26+
PROGNAME = 'Sierpinski Triangle'

# The script expects exactly one command-line argument: the recursion depth.
# The usage text names this script (sierpinski_triangle.py), matching the
# usage line in the module docstring.
if len(sys.argv) != 2:
    raise Exception('right format for using this script: $python sierpinski_triangle.py <int:depth_for_fractal>')

# Module-level pen shared by triangle(): cursor hidden, speed 5, red ink.
myPen = turtle.Turtle()
myPen.ht()
myPen.speed(5)
myPen.pencolor('red')

# Corners of the outermost triangle: bottom-left, top, bottom-right.
points = [[-175, -125], [0, 175], [175, -125]]
36+
37+
def getMid(p1,p2):
    '''Return the midpoint of the segment joining p1 and p2.

    @args:
    --
    p1 : indexable pair, x at index 0 and y at index 1.
    p2 : indexable pair, x at index 0 and y at index 1.

    @returns:
    --
    tuple : (x, y) coordinates of the midpoint.
    '''
    # Divide by 2.0 instead of 2 so integer coordinates are not floor-divided
    # under Python 2; under Python 3 the result is the same as before.
    return ((p1[0] + p2[0]) / 2.0, (p1[1] + p2[1]) / 2.0)
39+
40+
def triangle(points, depth):
    '''Draw the outline of a triangle, then recurse into its three corner triangles.

    Uses the module-level pen `myPen` for drawing and `getMid` for midpoints.

    @args:
    --
    points : sequence of three [x, y] corners of the triangle to outline.
    depth : remaining recursion levels; when it is 0 or less only the outline is drawn.
    '''
    first, second, third = points[0], points[1], points[2]

    # Move to the first corner without drawing, then trace the closed outline.
    myPen.up()
    myPen.goto(first[0], first[1])
    myPen.down()
    for corner in (second, third, first):
        myPen.goto(corner[0], corner[1])

    if depth <= 0:
        return

    # Each sub-triangle keeps one original corner plus the midpoints of the
    # two edges that meet at that corner.
    sub_triangles = (
        [first, getMid(first, second), getMid(first, third)],
        [second, getMid(first, second), getMid(second, third)],
        [third, getMid(third, second), getMid(first, third)],
    )
    for corners in sub_triangles:
        triangle(corners, depth - 1)
62+
63+
64+
# Entry point: draw the fractal to the depth given on the command line.
requested_depth = int(sys.argv[1])
triangle(points, requested_depth)

tf_idf_generator.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
'''@Author: Anurag Kumar(mailto:anuragkumarak95@gmail.com)
2+
This module generates TF-IDF values (optionally dumped to a file) from a list of files that contain docs.
3+
4+
What is TF-IDF : https://en.wikipedia.org/wiki/Tf%E2%80%93idf
5+
6+
python:
7+
- 3.5
8+
9+
pre-requisites:
10+
- colorama==0.3.9
11+
12+
sample file format of input:
13+
14+
##START(NOT INCLUDED)
15+
sport smile today because signs Gemini
16+
little sister dealt severe allergy figure
17+
about looks gender color attitude nationality respect
18+
added video playlist Sonic Fightstick Edition
19+
weeks birthday scott wants camping keeper
20+
photo taking photo trying auction scale photo
21+
happy creatively capture story stage magical
22+
yoongi looks seokjin looking yoongi looking seokjin
23+
taking glasses because buffering cannot handle
24+
tried Michelle Obama proceeded defend whole pointless
25+
robbed shades backstage reading guess karma stealing
26+
remains sailors destroyer McCain collision found
27+
timeline beginnings infographics Catch upcoming debut
28+
##END(NOT INCLUDED)
29+
30+
here, every line represents a document.
31+
32+
have fun, cheers.
33+
'''
34+
import os, math, pickle
35+
from colorama import Fore, Style
36+
37+
# Short color codes accepted by paint(), mapped to colorama foreground codes.
switcher = dict(
    r=Fore.RED,
    bk=Fore.BLACK,
    b=Fore.BLUE,
    g=Fore.GREEN,
    y=Fore.YELLOW,
    m=Fore.MAGENTA,
    c=Fore.CYAN,
    w=Fore.WHITE,
)
47+
def paint(str,color='r'):
    '''Wrap a string in a foreground color code for colorful console logs.

    @args:
    --
    str : text to colorize.
    color : key of `switcher` selecting the color. default is 'r'=RED

    @returns:
    --
    str : the text prefixed with the color code and suffixed with
          Style.RESET_ALL, or the unchanged text when `color` is not a key
          of `switcher`.

    '''
    color_code = switcher.get(color)
    if color_code is None:
        # Unknown color code: hand the text back untouched.
        return str
    return color_code + str + Style.RESET_ALL
63+
64+
# Blue log prefix placed in front of every message this module prints.
TAG = paint('TF-IDF-GENE/', color='b')
65+
def find_tf_idf(file_names=['./../test/testdata'],prev_file_path=None, dump_path=None):
    '''Function to create a TF-IDF list of dictionaries for a corpus of docs.
    Every line of every input file is treated as one document (an empty line
    therefore counts as an empty document).
    If you opt for dumping the data, you can provide a file path ending in the
    .tfidfpkl extension (the standard of this module). You can also extend an
    older dump by giving its path as prev_file_path.

    For a word w in a doc d the stored weight is
        (count of w in d / number of unique words in d) * log(total docs / docs containing w)

    @Args:
    --
    file_names : paths of files to be processed on, you can give many small sized file, rather than one large file.
    prev_file_path : path of old .tfidfpkl file, if available. (default=None)
    dump_path : path of the .tfidfpkl file where the generated data is dumped. (default=None)

    @returns:
    --
    idf : a dict of unique words in corpus,with their document frequency as values.
    tf_idf : the generated tf-idf list of dictionaries for mentioned docs.

    @raises:
    --
    Exception : when dump_path is given but does not end with 'tfidfpkl'.
    '''
    tf_idf = []  # one {word: count} dict per doc; counts are turned into weights below
    idf = {}  # word -> number of docs that contain it

    # this statement is useful for altering an existing tf-idf file and adding new docs to it.(## memory is now the biggest issue)
    if prev_file_path:
        print(TAG,'modifying over exising file.. @',prev_file_path)
        # SECURITY: unpickling can execute arbitrary code; only load dumps you trust.
        with open(prev_file_path,'rb') as prev_file:
            idf,tf_idf = pickle.load(prev_file)
        # NOTE(review): the docs loaded here already hold TF-IDF weights, not raw
        # counts, yet the weighting loop below processes them like counts.
        # Confirm whether old docs are meant to be re-weighted this way.
        prev_doc_count = len(idf)  # number of unique words before adding new docs
        prev_corpus_length = len(tf_idf)

    for file_name in file_names:
        # text mode on purpose: 'rb' would yield bytes keys such as b'word'
        with open(file_name,'r') as corpus_file:
            for line in corpus_file:
                words = line.split()

                # document frequency: every distinct word counts once per doc
                for word in set(words):
                    idf[word] = idf.get(word, 0) + 1

                # raw occurrence count of every word inside this doc
                word_count = {}
                for word in words:
                    word_count[word] = word_count.get(word, 0) + 1
                tf_idf.append(word_count)

    # calculating final TF-IDF values for all words in all docs(line in a doc in this case)
    corpus_length = len(tf_idf)
    for doc in tf_idf:
        unique_words = len(doc)
        for key in doc:
            true_idf = math.log(corpus_length/idf[key])
            true_tf = doc[key]/unique_words
            doc[key] = true_tf * true_idf

    # do not get overwhelmed, just for logging the quantity of words that have been processed.
    print(TAG,'Total number of unique words in corpus',len(idf),'( '+paint('++'+str(len(idf)-prev_doc_count),'g')+' )' if prev_file_path else '')
    print(TAG,'Total number of docs in corpus:',len(tf_idf),'( '+paint('++'+str(len(tf_idf)-prev_corpus_length),'g')+' )' if prev_file_path else '')

    # dump if a file path is given
    if dump_path:
        if not dump_path.endswith('tfidfpkl'):
            raise Exception(TAG+"Please provide a .tfidfpkl file_path, it is the standard format of this module.")
        # the context manager guarantees the dump is flushed and closed
        with open(dump_path,'wb') as dump_file:
            pickle.dump((idf,tf_idf),dump_file,protocol=pickle.HIGHEST_PROTOCOL)
        print(TAG,'Dumping TF-IDF vars @',dump_path)
    return idf,tf_idf
128+

0 commit comments

Comments
 (0)