forked from klb3713/sentence2vec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nytcorpus.py
40 lines (32 loc) · 887 Bytes
/
nytcorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""
This is the script that compresses
NYT Corpus
"""
import glob
# Directory that is searched (non-recursively) for the input .txt files.
dirloc = "E:\\Allen\\NYTFuture\\NYT"
# Every path directly inside dirloc that matches *.txt, in glob's order.
files = glob.glob(dirloc + "\\*.txt")
# Single file that receives the output of the whole run.
output = "E:\\Allen\\R\\emnlp2015\\word2vec\\nyt_syn_lex.txt"
# Opened for writing in binary mode: an existing file is truncated and the
# written bytes are stored as given. Closed at the very end of the script.
wfile = open(output, 'wb')
# Running count of input lines processed; drives the progress output below.
a = 0
def clean(word):
    """Return ``word`` with basic punctuation characters removed.

    The characters dropped are the comma, full stop, single quote, double
    quote and both round brackets. Every other character, including any
    non-ASCII one, is left exactly as it was.
    """
    for mark in (',', '.', "'", '"', '(', ')'):
        word = word.replace(mark, '')
    return word
def ensure_unicode(v):
    """Return ``v`` as a text (unicode) string.

    Byte strings are decoded as UTF-8; any other value is converted with
    the text type's constructor, so non-string values become text too.
    Works on both Python 2 (where the text type is ``unicode``) and
    Python 3 (where it is ``str``).

    Raises:
        UnicodeDecodeError: if ``v`` is a byte string that is not valid UTF-8.
    """
    try:
        text_type = unicode  # noqa: F821 - only exists on Python 2
    except NameError:
        text_type = str  # Python 3: ``str`` is already the text type
    # ``bytes`` is an alias of ``str`` on Python 2, so this matches the
    # original ``isinstance(v, str)`` check there and also works on Python 3.
    if isinstance(v, bytes):
        v = v.decode('utf8')
    return text_type(v)
# Copy the cleaned third tab-separated column of every data row, from every
# input file, into the output file, one entry per "\r\n"-terminated line.
try:
    for f in files:
        with open(f, 'r') as content_file:
            # Skip the header row; the default stops an empty file from
            # raising StopIteration and aborting the whole run.
            next(content_file, None)
            for line in content_file:
                a += 1
                # Drop the line terminator first so that, when the third
                # column is the last one, its newline is not written out in
                # front of the explicit "\r\n" below.
                columns = line.rstrip("\r\n").split("\t")
                # Blank or truncated rows have no third column; skip them
                # instead of failing with IndexError.
                if len(columns) > 2:
                    wfile.write(clean(columns[2]) + "\r\n")
                if a % 10000 == 0:
                    print(a)  # progress indicator every 10000 input lines
finally:
    # Close the output even when an input file fails part-way through, so
    # rows already buffered are flushed to disk.
    wfile.close()