forked from knowledgehives-opensource/openvocabulary
/
fixuri.py
70 lines (58 loc) · 1.84 KB
/
fixuri.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# encoding: utf-8
"""
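fixuri.py

Fills in missing labels on URI records: the last path segment of each URI
is used as the label, CamelCase segments are split into separate words and
underscores are replaced with spaces.

Created by Sebastian Kruk.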
Copyright (c) Knowledge Hives sp. z o.o.. All rights reserved.
"""
import sys
import gc
import re
import getopt
import time
import datetime
from django.core.management import setup_environ
import settings
# DEBUG is switched off before Django is configured. The assert further down
# guards against it still being on -- presumably because Django records every
# executed query in connection.queries while DEBUG is True, which would keep
# growing over a full-table run (confirm against the Django version in use).
settings.DEBUG = False
setup_environ(settings)
# Kept after setup_environ(); presumably so the connection is created with
# the settings configured above -- TODO confirm.
from django.db import connection
# Fail fast if any queries have already been recorded on the connection.
assert not connection.queries, 'settings.DEBUG=True?'
# Project modules are imported last, once the environment has been set up.
# NOTE(review): the URI model used by the script presumably comes from
# ov.models via this star import -- confirm.
from ov.models import *
from ov.importer import *
# Captures the last path segment of a URI as the named group "label".
# At least one character followed by a slash must precede the segment, and a
# single trailing slash after it is tolerated.
name_pat = re.compile(r"^(?:.+)[/](?P<label>[^/]+)[/]?$")
# Matches text that consists entirely of two or more capitalised ASCII words
# run together (CamelCase), e.g. "OpenVocabulary".
words_pat = re.compile(r"^(?:[A-Z][a-z]+){2,}$")
# Matches one capitalised ASCII word at a time; used to split CamelCase text
# into its individual words.
word_pat = re.compile(r"(?P<word>[A-Z][a-z]+)")
def _label_from_uri(uri_text):
    """Derive a human-readable label from the last path segment of a URI.

    A CamelCase segment is split into words, with every word except the
    first lower-cased ("OpenVocabulary" -> "Open vocabulary"); underscores
    are replaced with spaces ("some_term" -> "some term").

    Returns the label, or None when ``uri_text`` is empty/None or does not
    match ``name_pat`` (i.e. has no usable last path segment).
    """
    if not uri_text:
        # A missing URI string would otherwise make name_pat.match() raise.
        return None
    found = name_pat.match(uri_text)
    if not found:
        return None
    # The "label" group is [^/]+, so it is never empty once the match succeeds.
    label = found.group('label')
    if words_pat.match(label):
        # words_pat guarantees at least two words, the first starting at 0.
        words = word_pat.findall(label)
        label = " ".join([words[0]] + [word.lower() for word in words[1:]])
    return label.replace('_', ' ')


def _fix_labels():
    """Give every URI record that has no label one derived from its URI.

    Iterates over all URI objects, prints a progress line for each, saves the
    records whose label could be derived and reports the ones whose URI
    yields no label. Finishes by printing the elapsed wall-clock seconds.

    Every message is a plain (byte) string template: str(uri) yields a byte
    string in Python 2, and formatting it into a u"..." template forces an
    implicit ASCII decode that fails on non-ASCII text.
    Single-argument print(...) is used because it behaves the same whether
    print is a statement (Python 2) or a function.
    """
    started = time.time()
    processed = 0
    for uri in URI.objects.iterator():
        print("%d | %s" % (processed, str(uri)))
        processed += 1
        if not uri.label:
            label = _label_from_uri(uri.uri)
            if label is None:
                print("Could not get label for %s " % str(uri))
            else:
                uri.label = label
                uri.save()
                # str(uri) is evaluated again on purpose: it may include the
                # label that has just been stored.
                print("extracted label for %s" % str(uri))
        if processed % 100 == 0:
            # Periodic explicit collection, every 100 records.
            gc.collect()
    # The two spaces reproduce the output of the former
    # `print u"Completed in ", value` statement.
    print("Completed in  %s" % (time.time() - started))


gc.enable()
print("will fix labels now (%s)" % sys.getfilesystemencoding())
_fix_labels()