Skip to content

Commit

Permalink
performance update to URLAnalysis
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5648 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Feb 24, 2009
1 parent 8444357 commit 0f6fa80
Showing 1 changed file with 17 additions and 8 deletions.
25 changes: 17 additions & 8 deletions source/de/anomic/data/URLAnalysis.java
@@ -35,11 +35,14 @@
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import de.anomic.kelondro.util.MemoryControl;
import de.anomic.yacy.yacyURL;

public class URLAnalysis {
@@ -69,13 +72,14 @@ public splitter(ArrayBlockingQueue<yacyURL> in, ConcurrentHashMap<String, Intege

public void run() {
yacyURL url;
Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_");
while (true) {
try {
url = in.take();
if (url == poison) break;
//System.out.println(url);
update(url.getHost().replaceAll("-", "\\.").split("\\."));
update(url.getPath().replaceAll("~", "/").replaceAll("\\(", "/").replaceAll("\\)", "/").replaceAll("\\+", "/").replaceAll("-", "/").replaceAll("@", "/").replaceAll(":", "/").replaceAll("%", "/").replaceAll("\\.", "/").replaceAll(";", "/").replaceAll("_", "/").split("/"));
update(p.matcher(url.getPath()).replaceAll("/").split("/"));
} catch (InterruptedException e) {
e.printStackTrace();
}
@@ -108,6 +112,7 @@ public static void main(String[] args) {
File outfile = new File(analysis);
BufferedReader reader = null;
long time = System.currentTimeMillis();
long start = time;
int count = 0;

System.out.println("start processing");
@@ -125,9 +130,9 @@ public static void main(String[] args) {
}
}
count++;
if (System.currentTimeMillis() - time > 10000) {
if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " urls.");
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
}
}
reader.close();
@@ -154,12 +159,16 @@ public static void main(String[] args) {
System.out.println("start processing results");
TreeMap<String, Integer> results = new TreeMap<String, Integer>();
count = 0;
for (Map.Entry<String, Integer> entry: out.entrySet()) {
Map.Entry<String, Integer> entry;
Iterator<Map.Entry<String, Integer>> i = out.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
results.put(num(entry.getValue().intValue() * (entry.getKey().length() - 1)) + " - " + entry.getKey(), entry.getValue());
count++;
i.remove(); // free memory
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " results.");
System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}

@@ -168,10 +177,10 @@ public static void main(String[] args) {
try {
BufferedOutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
count = 0;
for (Map.Entry<String, Integer> entry: results.entrySet()) {
os.write(entry.getKey().getBytes());
for (Map.Entry<String, Integer> e: results.entrySet()) {
os.write(e.getKey().getBytes());
os.write(new byte[]{'\t'});
os.write(("" + entry.getValue()).getBytes());
os.write(("" + e.getValue()).getBytes());
os.write(new byte[]{'\n'});
count++;
if (System.currentTimeMillis() - time > 10000) {
Expand Down

0 comments on commit 0f6fa80

Please sign in to comment.