Permalink
Browse files

UrlTreeBoilerplateRemoverComponent: minor modification

EntityRecognitionEngine: conditions are now inherited along the subsumption hierarchy; it is now possible to "disable" a gazetteer, or even a specific term, through the new "e" (enabled) setting, e.g. "/e=n"

git-svn-id: svn://first.ijs.si:3691/latino@623 35089d11-96e9-ae4d-82a0-4b2588c93174
  • Loading branch information...
1 parent 2e8eea4 commit c31a60e51cf51c5abf89cacbce8ce47e864ac8f5 mgrcar committed May 21, 2012
@@ -29,14 +29,17 @@ private MemoryStore mRdfStore
private Logger mLogger
= Logger.GetLogger(typeof(EntityRecognitionEngine));
- private static IStemmer mLemmatizer // make configurable?
+ private string mDefaultInstanceClass
+ = "http://project-first.eu/ontology#SentimentObject"; // TODO: make configurable
+
+ private static IStemmer mLemmatizer // *** make configurable?
= new Lemmatizer(Language.English);
// see http://www.regular-expressions.info/unicode.html for Unicode character classes
private static Regex mMicroTokenRegex
- = new Regex(@"[\d\p{L}]+", RegexOptions.Compiled); // *** currency symbols?
+ = new Regex(@"[\d\p{L}]+", RegexOptions.Compiled); // *** currency symbols, punctuation marks?
private static Regex mGazetteerMicroTokenRegex
- = new Regex(@"[\d\p{L}]+(/\p{L}+)?", RegexOptions.Compiled); // *** currency symbols?
+ = new Regex(@"[\d\p{L}]+(/\p{L}+)?", RegexOptions.Compiled); // *** currency symbols, punctuation marks?
private static Regex mConstraintRegex
= new Regex(@"(/\p{L}+=\p{L}+)+", RegexOptions.Compiled);
@@ -50,11 +53,11 @@ private static Entity P_STOP_WORD
= NAMESPACE + "stopWord";
private static Entity P_IMPORTS
= NAMESPACE + "imports";
- private static Entity P_SENTENCE_LEVEL_CONDITION
+ private static Entity P_HAS_SENTENCE_LEVEL_CONDITION
= NAMESPACE + "hasSentenceLevelCondition";
- private static Entity P_TEXTBLOCK_LEVEL_CONDITION
- = NAMESPACE + "hasTextBlockLevelCondition";
- private static Entity P_DOCUMENT_LEVEL_CONDITION
+ private static Entity P_HAS_BLOCK_LEVEL_CONDITION
+ = NAMESPACE + "hasBlockLevelCondition";
+ private static Entity P_HAS_DOCUMENT_LEVEL_CONDITION
= NAMESPACE + "hasDocumentLevelCondition";
private static Entity P_IDENTIFIED_BY
= NAMESPACE + "identifiedBy";
@@ -106,7 +109,7 @@ private class Condition
public enum Level
{
Sentence,
- TextBlock,
+ Block,
Document
}
@@ -157,38 +160,38 @@ public Sentence(IEnumerable<string> tokens, IEnumerable<int> spanInfo, IEnumerab
}
}
- private bool Match(GazetteerToken gToken, Token dToken, CaseMatchingType caseMatchingType, bool firstToken)
+ private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
{
// check POS tag
- if (gToken.mPosConstraint != null && !dToken.mPosTag.StartsWith(gToken.mPosConstraint)) { return false; }
+ if (gazToken.mPosConstraint != null && !docToken.mPosTag.StartsWith(gazToken.mPosConstraint)) { return false; }
// check word or lemma
- string gTokenStr;
- string dTokenStr;
- if (gToken.mLemma == null)
+ string gazTokenStr;
+ string docTokenStr;
+ if (gazToken.mLemma == null)
{
- gTokenStr = gToken.mTokenStr;
- dTokenStr = dToken.mTokenStr;
+ gazTokenStr = gazToken.mTokenStr;
+ docTokenStr = docToken.mTokenStr;
}
else
{
- gTokenStr = gToken.mLemma;
- dTokenStr = dToken.mLemma;
+ gazTokenStr = gazToken.mLemma;
+ docTokenStr = docToken.mLemma;
}
switch (caseMatchingType)
{
case CaseMatchingType.IgnoreCase:
- return string.Compare(gTokenStr, dTokenStr, StringComparison.OrdinalIgnoreCase) == 0;
+ return string.Compare(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase) == 0;
case CaseMatchingType.ExactMatch:
case CaseMatchingType.AllLowercase:
case CaseMatchingType.AllUppercase:
case CaseMatchingType.AllCapsStrict:
case CaseMatchingType.InitCapStrict:
- return gTokenStr == dTokenStr;
+ return gazTokenStr == docTokenStr;
case CaseMatchingType.InitCapLoose:
- return (!firstToken && string.Compare(gTokenStr, dTokenStr, StringComparison.OrdinalIgnoreCase) == 0)
- || (firstToken && char.IsUpper(dTokenStr[0]) && string.Compare(gTokenStr, dTokenStr, StringComparison.OrdinalIgnoreCase) == 0);
+ return (!firstToken && string.Compare(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase) == 0)
+ || (firstToken && char.IsUpper(docTokenStr[0]) && string.Compare(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase) == 0);
case CaseMatchingType.AllCapsLoose:
- return char.IsUpper(dTokenStr[0]) && string.Compare(gTokenStr, dTokenStr, StringComparison.OrdinalIgnoreCase) == 0;
+ return char.IsUpper(docTokenStr[0]) && string.Compare(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase) == 0;
default:
throw new ArgumentValueException("caseMatchingType");
}
@@ -198,7 +201,8 @@ public void Match(Gazetteer gazetteer, out ArrayList<Pair<int, int>> spans)
{
spans = new ArrayList<Pair<int, int>>();
foreach (GazetteerTerm term in gazetteer.mTerms)
- {
+ {
+ if (!term.mEnabled) { continue; }
int lastIdx = mTokens.Count - term.mTokens.Count;
for (int i = 0; i <= lastIdx; i++)
{
@@ -333,15 +337,15 @@ Set<Gazetteer> documentEntityInfo
{
if (condition.mLevel == Condition.Level.Document)
{
- if (!documentEntityInfo.Contains(condition.mGazetteer)) { /*Console.WriteLine("!");*/ valid = false; break; }
+ if (!documentEntityInfo.Contains(condition.mGazetteer)) { valid = false; break; }
}
- else if (condition.mLevel == Condition.Level.TextBlock)
+ else if (condition.mLevel == Condition.Level.Block)
{
- if (!textBlockGazetteers.Contains(condition.mGazetteer)) { /*Console.WriteLine("!!");*/ valid = false; break; }
+ if (!textBlockGazetteers.Contains(condition.mGazetteer)) { valid = false; break; }
}
else if (condition.mLevel == Condition.Level.Sentence)
{
- if (!sentenceInfo.Value.ContainsKey(condition.mGazetteer)) { /*Console.WriteLine("!!!");*/ valid = false; break; }
+ if (!sentenceInfo.Value.ContainsKey(condition.mGazetteer)) { valid = false; break; }
}
}
if (valid)
@@ -409,6 +413,7 @@ private class GazetteerTerm
public ArrayList<GazetteerToken> mTokens
= new ArrayList<GazetteerToken>();
public CaseMatchingType mCaseMatchingType;
+ public bool mEnabled;
private void PrepareTokens(CaseMatchingType caseMatchingType, bool processLemmas)
{
@@ -450,9 +455,10 @@ private void PrepareTokens(CaseMatchingType caseMatchingType, bool processLemmas
}
}
- private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, Gazetteer gazetteer)
+ private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
{
mCaseMatchingType = caseMatchingType;
+ mEnabled = enabled;
IEnumerator<string> enumTokens = tokens.GetEnumerator();
IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator();
while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
@@ -477,19 +483,20 @@ private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string>
}
}
- public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, Gazetteer gazetteer)
- {
- InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, gazetteer);
- }
+ //public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
+ //{
+ // InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
+ //}
- public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag)
+ public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
{
// default settings
CaseMatchingType caseMatchingType = defaultCaseMatchingType;
bool lemmatize = defaultLemmatizeFlag;
+ bool enabled = defaultEnabledFlag;
// parse term settings
termDef = mConstraintRegex.Replace(termDef, new MatchEvaluator(delegate(Match m) {
- ParseGazetteerSettings(m.Value, out caseMatchingType, out lemmatize);
+ ParseGazetteerSettings(m.Value, ref caseMatchingType, ref lemmatize, ref enabled);
return "";
}));
ArrayList<string> tokens = new ArrayList<string>();
@@ -509,7 +516,7 @@ public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defau
posConstraints.Add(posConstraint);
match = match.NextMatch();
}
- InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, gazetteer);
+ InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
}
public string GetLemma()
@@ -551,6 +558,8 @@ public ArrayList<Gazetteer> mImportedGazetteers
= new ArrayList<Gazetteer>();
public ArrayList<Condition> mConditions
= new ArrayList<Condition>();
+ public bool mEnabled
+ = true;
public Gazetteer(string uri)
{
@@ -588,27 +597,48 @@ public void ImportGazetteers(MemoryStore rdfStore, Dictionary<string, Gazetteer>
public void ReadConditions(MemoryStore rdfStore, Dictionary<string, Gazetteer> gazetteers)
{
- Resource[] conditionGazetteers = rdfStore.SelectObjects(mUri, P_SENTENCE_LEVEL_CONDITION);
- foreach (Entity conditionGazetteer in conditionGazetteers)
- {
- mConditions.Add(new Condition(gazetteers[conditionGazetteer.Uri], Condition.Level.Sentence));
- }
- conditionGazetteers = rdfStore.SelectObjects(mUri, P_TEXTBLOCK_LEVEL_CONDITION);
- foreach (Entity conditionGazetteer in conditionGazetteers)
+ ArrayList<string> crumbs = new ArrayList<string>(new string[] { mUri });
+ Entity[] objects = rdfStore.SelectSubjects(P_IDENTIFIED_BY, new Entity(mUri));
+ if (objects.Length > 0)
{
- mConditions.Add(new Condition(gazetteers[conditionGazetteer.Uri], Condition.Level.TextBlock));
+ Resource[] objTypes = rdfStore.SelectObjects(objects[0].Uri, P_TYPE);
+ if (objTypes.Length > 0)
+ {
+ crumbs.Add(objTypes[0].Uri);
+ Resource[] superClass = rdfStore.SelectObjects((Entity)objTypes[0], P_SUBCLASS_OF);
+ while (superClass.Length > 0)
+ {
+ crumbs.Add(superClass[0].Uri);
+ superClass = rdfStore.SelectObjects((Entity)superClass[0], P_SUBCLASS_OF);
+ }
+ }
}
- conditionGazetteers = rdfStore.SelectObjects(mUri, P_DOCUMENT_LEVEL_CONDITION);
- foreach (Entity conditionGazetteer in conditionGazetteers)
+ crumbs.Reverse();
+ foreach (string uri in crumbs)
{
- mConditions.Add(new Condition(gazetteers[conditionGazetteer.Uri], Condition.Level.Document));
+ Resource[] conditionGazetteers = rdfStore.SelectObjects(uri, P_HAS_SENTENCE_LEVEL_CONDITION);
+ foreach (Entity conditionGazetteer in conditionGazetteers)
+ {
+ mConditions.Add(new Condition(gazetteers[conditionGazetteer.Uri], Condition.Level.Sentence));
+ }
+ conditionGazetteers = rdfStore.SelectObjects(uri, P_HAS_BLOCK_LEVEL_CONDITION);
+ foreach (Entity conditionGazetteer in conditionGazetteers)
+ {
+ mConditions.Add(new Condition(gazetteers[conditionGazetteer.Uri], Condition.Level.Block));
+ }
+ conditionGazetteers = rdfStore.SelectObjects(uri, P_HAS_DOCUMENT_LEVEL_CONDITION);
+ foreach (Entity conditionGazetteer in conditionGazetteers)
+ {
+ mConditions.Add(new Condition(gazetteers[conditionGazetteer.Uri], Condition.Level.Document));
+ }
}
}
- private void ReadGazetteerSettings(MemoryStore rdfStore, out CaseMatchingType caseMatchingType, out bool lemmatize)
+ private void ReadGazetteerSettings(MemoryStore rdfStore, out CaseMatchingType caseMatchingType, out bool lemmatize, out bool enabled)
{
caseMatchingType = CaseMatchingType.IgnoreCase;
lemmatize = false;
+ enabled = true;
ArrayList<string> crumbs = new ArrayList<string>(new string[] { mUri });
Entity[] objects = rdfStore.SelectSubjects(P_IDENTIFIED_BY, new Entity(mUri));
if (objects.Length > 0)
@@ -633,7 +663,7 @@ private void ReadGazetteerSettings(MemoryStore rdfStore, out CaseMatchingType ca
if (settings.Length > 0)
{
string settingsStr = ((Literal)settings[0]).Value;
- ParseGazetteerSettings(settingsStr, out caseMatchingType, out lemmatize);
+ ParseGazetteerSettings(settingsStr, ref caseMatchingType, ref lemmatize, ref enabled);
}
}
}
@@ -644,14 +674,14 @@ public void ReadTerms(MemoryStore rdfStore)
//Console.WriteLine("*** " + mUri + " ***");
CaseMatchingType caseMatchingType;
bool lemmatize;
- ReadGazetteerSettings(rdfStore, out caseMatchingType, out lemmatize);
+ ReadGazetteerSettings(rdfStore, out caseMatchingType, out lemmatize, out mEnabled);
// read terms
Resource[] terms = rdfStore.SelectObjects(mUri, P_TERM);
Set<string> skipList = new Set<string>();
foreach (Literal term in terms)
{
//Console.Write(term.Value + " -> ");
- GazetteerTerm termObj = new GazetteerTerm(term.Value, /*gazetteer=*/this, caseMatchingType, lemmatize);
+ GazetteerTerm termObj = new GazetteerTerm(term.Value, /*gazetteer=*/this, caseMatchingType, lemmatize, mEnabled);
string termStr = termObj.ToString();
if (termObj.mTokens.Count > 0 && !skipList.Contains(termStr))
{
@@ -708,6 +738,7 @@ public ArrayList<string> GetInstanceClassPath(string instanceUri)
{
ArrayList<string> crumbs = new ArrayList<string>();
string instanceClass = GetInstanceClass(instanceUri);
+ if (instanceClass == null) { instanceClass = mDefaultInstanceClass; }
crumbs.Add(instanceClass);
Resource[] superClass = mRdfStore.SelectObjects(instanceClass, P_SUBCLASS_OF);
while (superClass.Length > 0)
@@ -780,21 +811,23 @@ private static string Normalize(string str)
return RemoveDiacritics(strNrm);
}
- private static void ParseGazetteerSettings(string settingsStr, out CaseMatchingType caseMatchingType, out bool lemmatize)
+ private static void ParseGazetteerSettings(string settingsStr, ref CaseMatchingType caseMatchingType, ref bool lemmatize, ref bool enabled)
{
- caseMatchingType = CaseMatchingType.IgnoreCase;
- lemmatize = false;
string[] settings = settingsStr.TrimStart('/').Split('/');
foreach (string setting in settings)
{
string[] keyVal = setting.Split('=');
if (keyVal.Length == 2)
{
- if (keyVal[0] == "l") // lemmatization setting
+ if (keyVal[0] == "e") // enabled
+ {
+ enabled = keyVal[1] != "n";
+ }
+ else if (keyVal[0] == "l") // lemmatize
{
lemmatize = keyVal[1] == "y";
}
- else if (keyVal[0] == "c") // case-matching setting
+ else if (keyVal[0] == "c") // case-matching type
{
if (keyVal[1] == "ic") { caseMatchingType = CaseMatchingType.IgnoreCase; }
else if (keyVal[1] == "em") { caseMatchingType = CaseMatchingType.ExactMatch; }
@@ -249,7 +249,7 @@ private void AddToUrlCache(string urlKey, DateTime time, Pair<Dictionary<string,
private void RemoveItems(Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo, Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo, DateTime time)
{
double ageDays = 0;
- while (urlInfo.Second.Count > mMinQueueSize && (urlInfo.Second.Count > mMaxQueueSize || (ageDays = (time - urlInfo.Second.Peek().mTime).TotalDays) > (double)mHistoryAgeDays))
+ while (urlInfo.Second.Count > mMinQueueSize && ((ageDays = (time - urlInfo.Second.Peek().mTime).TotalDays) > (double)mHistoryAgeDays || urlInfo.Second.Count > mMaxQueueSize))
{
string rmvUrlKey = urlInfo.Second.Dequeue().mUrlKey;
if (rmvUrlKey != null) { urlInfo.First.Remove(rmvUrlKey); }

0 comments on commit c31a60e

Please sign in to comment.