Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

* removed unused files

  • Loading branch information...
commit 75cbd151e2a46620330854154b40764c1d661086 1 parent 5df3672
Yusuke Matsubara authored February 27, 2012
35  lcp.py
... ...
@@ -1,35 +0,0 @@
1  
-from linsuffarr import *
2  
-
3  
-def longest_common_substring(text, freq, length):
4  
-    sa = SuffixArray(text, unit=UNIT_CHARACTER)
5  
-    lcp = sa._LCP_values
6  
-    sa = sa.SA
7  
-    ret = ''
8  
-    i = 0
9  
-    while i < len(sa):
10  
-        f = 1
11  
-        while i + 1 < len(sa) and lcp[i+1] >= length:
12  
-            f += 1
13  
-            i += 1
14  
-        if f >= freq and lcp[i] > len(ret):
15  
-            ret = text[sa[i-1] : sa[i-1]+lcp[i]]
16  
-        if f == 1:
17  
-            i += 1
18  
-    return ret
19  
-
20  
-def common_substring(a, freq, length):
21  
-    text = ''
22  
-    anchor = 1
23  
-    for x in a:
24  
-        text += x + chr(anchor)
25  
-        anchor = (anchor + 1) % 256
26  
-    return longest_common_substring(text, freq, length)
27  
-
28  
-if __name__ == '__main__':
29  
-    a = ['[[File:Stop hand nuvola.svg|30px|alt=|link=]] \'\'\'This is your last warning\'\'\'. You will be blocked from editing the next time you vandalize a page, as you did with <span class="plainlinks">[{{{2}}} this edit]</span> to [[:{{{1}}}]]. <!-- Uw-vandalism4 --><!-- Template:Huggle/warn-4 --> ~~<noinclude></noinclude>~~<noinclude> [[pt:Predefinio:Huggle/warn-4]] </noinclude>',
30  
-         '<div style=clear: both></div>{{<includeonly>safesubst:</includeonly>Huggle/uw-4 |page=[[:{{{1}}}]] with [{{{2}}} this edit] |extra=~~<noinclude></noinclude>~~ |reason={{{reason|[[Wikipedia:Vandalism|vandalize]] Wikipedia}}} }}<!-- Template:uw-vandalism4 --><noinclude> {{Huggle/TemplateNotice|series = uw-vandalism|max = 4im|s1 = uw-v4|s2 = uw-vand4|s3 = uw-vandal4|nothankyou=yes}} </noinclude>',
31  
-         '<div style=clear: both></div>{{<includeonly>safesubst:</includeonly>Huggle/uw-4 |page=[[:{{{1}}}]] with <span class="plainlinks">[{{{2}}} this edit]</span> |extra=~~<noinclude></noinclude>~~ |reason={{{reason|[[Wikipedia:Vandalism|vandalize]] Wikipedia}}} }}<!-- Template:Huggle/warn-4 --><noinclude> {{Huggle/TemplateNotice|series = uw-vandalism|max = 4im|s1 = uw-v4|s2 = uw-vand4|s3 = uw-vandal4|nothankyou=yes}} </noinclude>',
32  
-         '[[File:Stop hand nuvola.svg|30px|alt=|link=]] \'\'\'This is your last warning\'\'\'. You will be blocked from editing the next time you vandalize a page, as you did with <span class="plainlinks">[{{{2}}} this edit]</span> to [[:{{{1}}}]]. <!-- Uw-vandalism4 --><!-- Template:Huggle/warn-4 --> ~~<noinclude></noinclude>~~<noinclude> [[pt:Predefinio:Huggle/warn-4]] </noinclude>',
33  
-         '[[File:Stop hand nuvola.svg|30px]] \'\'\'This is your last warning\'\'\'. You may be \'\'\'blocked from editing without further warning\'\'\' the next time you vandalize a page, as you did with <span class="plainlinks">[{{{2}}} this edit]</span> to [[:{{{1}}}]]. <!-- Template:uw-huggle4 --> ~~<noinclude></noinclude>~~<noinclude> </noinclude>',
34  
-         '[[File:Stop hand nuvola.svg|30px]] \'\'\'This is your last warning\'\'\'. You may be \'\'\'blocked from editing without further warning\'\'\' the next time you vandalize a page, as you did with <span class="plainlinks">[{{{2}}} this edit]</span> to [[:{{{1}}}]]. <!-- Template:Huggle/warn-4 --><!-- Template:uw-vandalism4 --> ~~<noinclude></noinclude>~~<noinclude>']
35  
-    print common_substring(a, 5, 20)
50  src/main/java/org/wikimedia/revdiffsearch/Pair.java
... ...
@@ -1,50 +0,0 @@
1  
-package org.wikimedia.revdiffsearch;
2  
-
3  
-public class Pair<X,Y> {
4  
-  protected X first;
5  
-  protected Y second;
6  
-  public Pair(X x, Y y) {
7  
-    this.first = x;
8  
-    this.second = y;
9  
-  }
10  
-  public Pair(Pair<X,Y> p) {
11  
-    this(p.first, p.second);
12  
-  }
13  
-
14  
-  public X getFirst()  { return this.first; }
15  
-  public Y getSecond() { return this.second; }
16  
-  public void setFirst(X c)  { this.first  = c; }
17  
-  public void setSecond(Y c) { this.second = c; }
18  
-  @Override
19  
-    public boolean equals(Object o) {
20  
-    if ( this == o ) return true;
21  
-    if ( o instanceof Pair ) {
22  
-      Pair p = (Pair)o;
23  
-      return this.first.equals(p.first) && this.second.equals(p.second);
24  
-    } else {
25  
-      return false;
26  
-    }
27  
-  }
28  
-  @Override public int hashCode() {
29  
-    return this.first.hashCode() * 107 + this.second.hashCode();
30  
-  }
31  
-
32  
-  @Override
33  
-    public String toString() {
34  
-    return "(" + this.first + "," + this.second + ")";
35  
-  }
36  
-  public static <Z,W> Pair<Z,W> newInstance(Z z, W w) {
37  
-    return new Pair<Z,W>(z,w);
38  
-  }
39  
-  public static <Z,W> Pair<Z,W> newInstance(Pair<Z,W> p) {
40  
-    return new Pair<Z,W>(p);
41  
-  }
42  
-}
43  
-
44  
-/*
45  
- * Local variables:
46  
- * tab-width: 2
47  
- * c-basic-offset: 2
48  
- * indent-tabs-mode: nil
49  
- * End:
50  
- */
377  src/main/java/org/wikimedia/revdiffsearch/Searcher.java
... ...
@@ -1,377 +0,0 @@
1  
-package org.wikimedia.revdiffsearch;
2  
-
3  
-/**
4  
- * Copyright 2011 Apache Software Foundation
5  
- *
6  
- * Licensed under the Apache License, Version 2.0 (the "License");
7  
- * you may not use this file except in compliance with the License.
8  
- * You may obtain a copy of the License at
9  
- *
10  
- *     http://www.apache.org/licenses/LICENSE-2.0
11  
- *
12  
- * Unless required by applicable law or agreed to in writing, software
13  
- * distributed under the License is distributed on an "AS IS" BASIS,
14  
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  
- * See the License for the specific language governing permissions and
16  
- * limitations under the License.
17  
- */
18  
-
19  
-import java.io.BufferedReader;
20  
-import java.io.File;
21  
-import java.io.FileInputStream;
22  
-import java.io.FileWriter;
23  
-import java.io.IOException;
24  
-import java.io.InputStreamReader;
25  
-import java.text.DateFormat;
26  
-import java.text.SimpleDateFormat;
27  
-import java.util.Date;
28  
-import java.util.Iterator;
29  
-import java.util.List;
30  
-import java.util.Collection;
31  
-import java.util.ArrayList;
32  
-import org.apache.commons.lang3.StringEscapeUtils;
33  
-import au.com.bytecode.opencsv.CSVWriter;
34  
-
35  
-import org.apache.lucene.analysis.Analyzer;
36  
-import org.apache.lucene.document.Document;
37  
-import org.apache.lucene.document.Fieldable;
38  
-import org.apache.lucene.queryParser.ParseException;
39  
-import org.apache.lucene.queryParser.QueryParser;
40  
-import org.apache.lucene.search.IndexSearcher;
41  
-import org.apache.lucene.index.IndexReader;
42  
-import org.apache.lucene.search.Query;
43  
-import org.apache.lucene.search.ScoreDoc;
44  
-import org.apache.lucene.search.TopDocs;
45  
-import org.apache.lucene.store.FSDirectory;
46  
-import org.apache.lucene.util.Version;
47  
-import org.wikimedia.revdiffsearch.utils.FileUtils;
48  
-
49  
-/** Simple command-line based search demo. */
50  
-public class Searcher {
51  
-	private static final String searchKey = "rev_id";
52  
-	private static int MAX_HITS = 10000;
53  
-
54  
-	private Searcher() {
55  
-	}
56  
-
57  
-	private static String truncateString(String str, int max) {
58  
-		if (str.length() + 3 > max) {
59  
-			return str.substring(0, max - 3) + "...";
60  
-		} else {
61  
-			return str;
62  
-		}
63  
-	}
64  
-
65  
-	/** Simple command-line based search demo. */
66  
-	public static void main(String[] args) throws Exception {
67  
-		String usage = "Usage:\tjava "
68  
-				+ Searcher.class.getName()
69  
-				+ " [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/java/4_0/demo.html for details.";
70  
-		if (args.length > 0
71  
-				&& ("-h".equals(args[0]) || "-help".equals(args[0]))) {
72  
-			System.out.println(usage);
73  
-			System.exit(0);
74  
-		}
75  
-
76  
-		String index = "index";
77  
-		String field = "added";
78  
-		String queries = null;
79  
-		int repeat = 0;
80  
-		boolean raw = false;
81  
-		String queryString = null;
82  
-		int hitsPerPage = 10;
83  
-    boolean exact = false;
84  
-
85  
-		for (int i = 0; i < args.length; i++) {
86  
-			if ("-index".equals(args[i])) {
87  
-				index = args[i + 1];
88  
-				i++;
89  
-			} else if ("-field".equals(args[i])) {
90  
-				field = args[i + 1];
91  
-				i++;
92  
-			} else if ("-queries".equals(args[i])) {
93  
-				queries = args[i + 1];
94  
-				i++;
95  
-			} else if ("-query".equals(args[i])) {
96  
-				queryString = args[i + 1];
97  
-				i++;
98  
-			} else if ("-repeat".equals(args[i])) {
99  
-				repeat = Integer.parseInt(args[i + 1]);
100  
-				i++;
101  
-			} else if ("-raw".equals(args[i])) {
102  
-				raw = true;
103  
-			} else if ("-exact".equals(args[i])) {
104  
-				exact = true;
105  
-			} else if ("-paging".equals(args[i])) {
106  
-				hitsPerPage = Integer.parseInt(args[i + 1]);
107  
-				if (hitsPerPage <= 0) {
108  
-					System.err
109  
-							.println("There must be at least 1 hit per page.");
110  
-					System.exit(1);
111  
-				}
112  
-				i++;
113  
-			}
114  
-		}
115  
-
116  
-		IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(
117  
-				index)));
118  
-		System.err.println("Index contains " + searcher.maxDoc()
119  
-				+ " documents.");
120  
-		Analyzer analyzer = RevDiffSearchUtils.getAnalyzer();
121  
-
122  
-		BufferedReader in = null;
123  
-		if (queries != null) {
124  
-			in = new BufferedReader(new InputStreamReader(new FileInputStream(
125  
-					queries), "UTF-8"));
126  
-		} else {
127  
-			in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
128  
-		}
129  
-		QueryParser parser = new QueryParser(Version.LUCENE_35, field, analyzer);
130  
-		
131  
-		while (true) {
132  
-			if (queries == null && queryString == null) { // prompt the user
133  
-				System.out.println("Enter query: ");
134  
-			}
135  
-
136  
-			String line = queryString != null ? queryString : in.readLine();
137  
-
138  
-			if (line == null || line.length() == -1) {
139  
-				break;
140  
-			}
141  
-
142  
-			line = line.trim();
143  
-			if (line.length() == 0) {
144  
-				break;
145  
-			}
146  
-
147  
-			try {
148  
-				Query query = parser.parse(line);
149  
-
150  
-				System.out.println("Searching for: " + query.toString(field));
151  
-
152  
-				if (repeat > 0) { // repeat & time as benchmark
153  
-					Date start = new Date();
154  
-					for (int i = 0; i < repeat; i++) {
155  
-						searcher.search(query, null, 100);
156  
-					}
157  
-					Date end = new Date();
158  
-					System.out.println("Time: "
159  
-							+ (end.getTime() - start.getTime()) + "ms");
160  
-				}
161  
-
162  
-				doPagingSearch(in, searcher, query, hitsPerPage, raw, exact,
163  
-                       queryString == null? line: queryString,
164  
-                       queries == null && queryString == null);
165  
-
166  
-			} catch (ParseException e) {
167  
-				System.out
168  
-						.println("Error parsing the query, please refine your query.");
169  
-			}
170  
-
171  
-			if (queryString != null) {
172  
-				break;
173  
-			}
174  
-		}
175  
-		searcher.close();
176  
-	}
177  
-
178  
-	public static void setMAX_HITS(int max_hits) {
179  
-		MAX_HITS = max_hits;
180  
-	}
181  
-
182  
-	public static int getMAX_HITS() {
183  
-		return MAX_HITS;
184  
-	}
185  
-
186  
-	/**
187  
-	 * This demonstrates a typical paging search scenario, where the search
188  
-	 * engine presents pages of size n to the user. The user can then go to the
189  
-	 * next page if interested in the next hits.
190  
-	 * 
191  
-	 * When the query is executed for the first time, then only enough results
192  
-	 * are collected to fill 5 result pages. If the user wants to page beyond
193  
-	 * this limit, then the query is executed another time and all hits are
194  
-	 * collected.
195  
-	 * 
196  
-	 */
197  
-	private static void printResultHeading(List<Fieldable> fields) {
198  
-		if (fields != null) {
199  
-			System.out.print("#\tScore\t");
200  
-			Iterator<Fieldable> it = fields.iterator();
201  
-			while (it.hasNext()) {
202  
-				Fieldable value = it.next();
203  
-				System.out.print(value.name() + "\t");
204  
-			}
205  
-			System.out.println();
206  
-		}
207  
-	}
208  
-
209  
-
210  
-	public static void writeResults(String queryStr, IndexReader reader,
211  
-                                  int hits) {
212  
-		String sFileName = FileUtils.createFilename(queryStr);
213  
-    int max_hits = getMAX_HITS();
214  
-		CSVWriter writer = null;
215  
-		try {
216  
-			writer = new CSVWriter(new FileWriter(sFileName), '\t');
217  
-      Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
218  
-      writer.writeNext(fieldNames.toArray(new String[fieldNames.size()]));
219  
-			for (int i = 0; i < hits && i <= max_hits; i++) {
220  
-        Document doc = reader.document(i);
221  
-        List<String> vals = new ArrayList<String>();
222  
-        for ( String name: fieldNames ) {
223  
-					vals.add(StringEscapeUtils
224  
-                   .escapeJava(doc.getFieldable(name).stringValue()));
225  
-				}
226  
-        writer.writeNext(vals.toArray(new String[vals.size()]));
227  
-			}
228  
-			writer.flush();
229  
-		} catch (IOException e) {
230  
-			System.out.println("Cannot write to file");
231  
-			e.printStackTrace();
232  
-		} finally {
233  
-      try {
234  
-        if ( writer != null )
235  
-          writer.close();
236  
-			} catch (IOException e) {
237  
-				System.out.println("Cannot close writer");
238  
-				e.printStackTrace();
239  
-			}
240  
-		}
241  
-
242  
-	}
243  
-
244  
-	public static void printCommandline(int start, int hitsPerPage,
245  
-			int numTotalHits) {
246  
-		System.out.print("Press ");
247  
-		if (start - hitsPerPage >= 0) {
248  
-			System.out.print("(p)revious page, ");
249  
-		}
250  
-		if (start + hitsPerPage < numTotalHits) {
251  
-			System.out.print("(n)ext page, (w)rite to file, ");
252  
-		}
253  
-		System.out.println("(q)uit or enter number to jump to a page.");
254  
-	}
255  
-
256  
-	public static void doPagingSearch(BufferedReader in,
257  
-                                    IndexSearcher searcher, Query query, int hitsPerPage, boolean raw, boolean exact,
258  
-                                    String queryStr, boolean interactive) throws IOException {
259  
-		// Collect enough docs to show 5 pages
260  
-		TopDocs results = searcher.search(query, 5 * hitsPerPage);
261  
-		ScoreDoc[] hits = results.scoreDocs;
262  
-
263  
-		int numTotalHits = results.totalHits;
264  
-		System.out.println(numTotalHits + " total matching documents");
265  
-
266  
-		int start = 0;
267  
-		int end = Math.min(numTotalHits, hitsPerPage);
268  
-		boolean quit = false;
269  
-
270  
-		while (true) {
271  
-			if (end > hits.length) {
272  
-				System.out
273  
-						.println("Only results 1 - " + hits.length + " of "
274  
-								+ numTotalHits
275  
-								+ " total matching documents collected.");
276  
-				System.out.println("Collect more (y/n) ?");
277  
-				String line = in.readLine();
278  
-				if (line.length() == 0 || line.charAt(0) == 'n') {
279  
-					break;
280  
-				}
281  
-
282  
-				hits = searcher.search(query, numTotalHits).scoreDocs;
283  
-			}
284  
-
285  
-			end = Math.min(hits.length, start + hitsPerPage);
286  
-
287  
-			for (int i = start; i < end; i++) {
288  
-
289  
-        if ( exact ) {
290  
-          // check if the document really contains the query string
291  
-          // FIXME: doesn't work for any complex ("field:value" type) query
292  
-          if (searcher.doc(hits[i].doc).getField("added").stringValue().indexOf(queryStr) < 0 ) {
293  
-            continue;
294  
-          }
295  
-        }
296  
-
297  
-				if (raw) { // output raw format
298  
-					System.out.println("doc=" + hits[i].doc + " score="
299  
-							+ hits[i].score);
300  
-					continue;
301  
-				}
302  
-
303  
-				Document doc = searcher.doc(hits[i].doc);
304  
-				String key = doc.get(searchKey);
305  
-				List<Fieldable> fields = doc.getFields();
306  
-				Iterator<Fieldable> it = fields.iterator();
307  
-				if (key != null) {
308  
-					if (i == 0) {
309  
-						printResultHeading(fields);
310  
-					}
311  
-					System.out.print((i + 1) + ".\t" + hits[i].score + "\t");
312  
-					while (it.hasNext()) {
313  
-						Fieldable field = it.next();
314  
-						System.out.print(StringEscapeUtils
315  
-								.escapeJava(truncateString(field.stringValue(),
316  
-										100))
317  
-								+ "\t");
318  
-					}
319  
-					// System.out.println((i + 1) + ". " + key);
320  
-					System.out.println();
321  
-				} else {
322  
-					System.out.println((i + 1) + ". " + "No key (" + searchKey
323  
-							+ ") for this document");
324  
-				}
325  
-
326  
-			}
327  
-
328  
-			if (!interactive || end == 0) {
329  
-				break;
330  
-			}
331  
-
332  
-			if (numTotalHits >= end) {
333  
-				while (true) {
334  
-
335  
-					printCommandline(start, hitsPerPage, numTotalHits);
336  
-
337  
-					String line = in.readLine();
338  
-					if (line.length() == 0 || line.charAt(0) == 'x') {
339  
-						quit = true;
340  
-						break;
341  
-					}
342  
-					if (line.charAt(0) == 'p') {
343  
-						start = Math.max(0, start - hitsPerPage);
344  
-						break;
345  
-					} else if (line.charAt(0) == 'q') {
346  
-						break;
347  
-					} else if (line.charAt(0) == 'n') {
348  
-						if (start + hitsPerPage < numTotalHits) {
349  
-							start += hitsPerPage;
350  
-						}
351  
-						break;
352  
-					} else if (line.charAt(0) == 'w') {
353  
-						writeResults(query.toString(), searcher.getIndexReader(), results.totalHits);
354  
-
355  
-					} else {
356  
-						try {
357  
-							int page = Integer.parseInt(line);
358  
-							if ((page - 1) * hitsPerPage < numTotalHits) {
359  
-								start = (page - 1) * hitsPerPage;
360  
-								break;
361  
-							}
362  
-						} catch (java.lang.NumberFormatException e) {
363  
-							System.out.println("No such page");
364  
-						}
365  
-					}
366  
-				}
367  
-				end = Math.min(numTotalHits, start + hitsPerPage);
368  
-			}
369  
-			if (quit) {
370  
-				System.exit(0);
371  
-			}
372  
-		}
373  
-	}
374  
-}
375  
-/*
376  
- * Local variables: tab-width: 2 c-basic-offset: 2 indent-tabs-mode: t End:
377  
- */
141  src/main/java/org/wikimedia/revdiffsearch/analysis/Dataset.java
... ...
@@ -1,141 +0,0 @@
1  
-package org.wikimedia.revdiffsearch.analysis;
2  
-
3  
-import java.io.FileWriter;
4  
-import java.io.IOException;
5  
-import java.text.ParseException;
6  
-import java.text.SimpleDateFormat;
7  
-import java.util.Calendar;
8  
-import java.util.Date;
9  
-import java.util.HashMap;
10  
-import java.util.Set;
11  
-import java.util.TimeZone;
12  
-
13  
-import org.apache.commons.lang3.mutable.MutableInt;
14  
-import org.wikimedia.revdiffsearch.utils.FileUtils;
15  
-
16  
-public class Dataset {
17  
-	/*
18  
-	 * The key of the hashmap is the year and the second key is the month. 
19  
-	 * 
20  
-	 * 
21  
-	 */
22  
-	static String name;
23  
-	static String pattern = "yyyy-MM-dd'T'HH:mm:ss";
24  
-	static Calendar calendar;
25  
-	static HashMap<Integer, HashMap<Integer, MutableInt>> container = new HashMap<Integer, HashMap<Integer, MutableInt>>();
26  
-
27  
-	public Dataset(String dataset_name) {
28  
-		name= dataset_name; 
29  
-		calendar = getCalendar();
30  
-	}
31  
-
32  
-	public boolean containsYear(int year) {
33  
-		if (container.containsKey(year)) {
34  
-			return true;
35  
-		}
36  
-		return false;
37  
-	}
38  
-	
39  
-	public boolean containsMonth(int year, int month) {
40  
-		HashMap<Integer, MutableInt> result = container.get(year);
41  
-		if (result.containsKey(month)) {
42  
-			return true;
43  
-		}
44  
-		return false;
45  
-	}
46  
-	
47  
-	public static Date convertTimestamptoDate(String timestamp) {
48  
-		Date date = null;
49  
-		
50  
-		SimpleDateFormat sdf = new SimpleDateFormat(pattern);
51  
-		try {
52  
-			date = sdf.parse(timestamp);
53  
-			System.out.println(date.toString());
54  
-		} catch (ParseException e) {
55  
-			System.out.println("Could not parse " + timestamp.toString());
56  
-		}
57  
-		return date;
58  
-	}
59  
-
60  
-	private static Calendar getCalendar() {
61  
-		TimeZone tz = TimeZone.getTimeZone("UTC");
62  
-		Calendar calendar = Calendar.getInstance(tz);
63  
-		return calendar;
64  
-	}
65  
-	
66  
-	
67  
-	private static int getComponentFromDate(Date date, String key) {
68  
-		int result = 0;
69  
-		calendar.setTime(date);
70  
-		if (key == "year") {
71  
-			result = calendar.get(Calendar.YEAR);
72  
-		} else if (key=="month"){ 
73  
-			//ARRGGHHH.. January is 0, so we need to add +1
74  
-			result = calendar.get(Calendar.MONTH)+1;
75  
-		} else if (key=="day"){
76  
-			result = calendar.get(Calendar.DAY_OF_MONTH);
77  
-		} else {
78  
-			System.out.println(key.toString() + " is an invalid key, please choose from year, month or day");
79  
-		}
80  
-		return result;
81  
-	}
82  
-	
83  
-//	private static int getMonthFromDate(Date date) {
84  
-//		int result = -1;
85  
-//		if (date != null) {
86  
-//			//Calendar calendar = getCalendar();
87  
-//			calendar.setTime(date);
88  
-//			result = calendar.get(Calendar.MONTH);
89  
-//		}
90  
-//		return result;
91  
-//	}	
92  
-//	
93  
-//	private static int getYearFromDate(Date date) {
94  
-//		int result = -1;
95  
-//		if (date != null) {
96  
-//			//Calendar calendar = getCalendar();
97  
-//			calendar.setTime(date);
98  
-//			result = calendar.get(Calendar.YEAR);
99  
-//		}
100  
-//		return result;
101  
-//	}
102  
-
103  
-	public void addDate(String timestamp) {
104  
-		Date date = convertTimestamptoDate(timestamp);
105  
-		int year = getComponentFromDate(date, "year");
106  
-		int month = getComponentFromDate(date, "month");
107  
-		if (!container.containsKey(year)) {
108  
-			container.put(year, new HashMap<Integer, MutableInt>());
109  
-		}
110  
-		
111  
-		if (!container.get(year).containsKey(month)) {
112  
-			MutableInt count = new MutableInt();
113  
-			container.get(year).put(month, count);
114  
-		}
115  
-		
116  
-		incrementObs(year, month);
117  
-	}
118  
-	
119  
-	private void incrementObs(int year, int month) {
120  
-		MutableInt count = container.get(year).get(month);
121  
-		count.increment();
122  
-		container.get(year).put(month, count);
123  
-	}
124  
-	
125  
-	public static void writeDataset() throws IOException {
126  
-		String filename = FileUtils.createFilename(name);
127  
-		FileWriter fstream = new FileWriter(filename);
128  
-		
129  
-		Set<Integer> years = container.keySet();
130  
-		for (int year: years) {
131  
-			Set<Integer> months = container.get(year).keySet();
132  
-			for (int month: months) {
133  
-				MutableInt count = container.get(year).get(month);
134  
-				String row = String.format("%s\t%s\t%s\n", year, month, count.toString());
135  
-				fstream.write(row);
136  
-			}
137  
-		}
138  
-		
139  
-		fstream.close();
140  
-	}
141  
-}
39  src/test/java/org/wikimedia/revdiffsearch/analysis/TestDataset.java
... ...
@@ -1,39 +0,0 @@
1  
-package org.wikimedia.revdiffsearch.analysis;
2  
-
3  
-import static org.junit.Assert.*;
4  
-
5  
-import java.util.Date;
6  
-
7  
-import org.junit.Before;
8  
-import org.junit.Test;
9  
-
10  
-public class TestDataset {
11  
-	
12  
-	private Dataset dataset;
13  
-	
14  
-	@Before
15  
-	public void setUp() throws Exception {
16  
-		dataset = new Dataset("foo");
17  
-	}
18  
-
19  
-	@Test
20  
-	public void testConvertTimestamptoDate() {
21  
-		String timestamp = "2009-10-19T10:47:34Z";
22  
-		Date date = Dataset.convertTimestamptoDate(timestamp);
23  
-		assertEquals(Date.class, date.getClass());
24  
-	}
25  
-
26  
-	@Test
27  
-	public void testAddDate() {
28  
-		String timestamp = "2009-10-19T10:47:34Z";
29  
-		dataset.addDate(timestamp);
30  
-		assertEquals("Result", true, dataset.containsYear(2009));
31  
-		assertEquals("Result", true, dataset.containsMonth(2009, 10));
32  
-	}
33  
-
34  
-	@Test
35  
-	public void testWriteDataset() {
36  
-		System.err.println("Not yet implemented");
37  
-	}
38  
-
39  
-}
3  template_query.py
@@ -18,7 +18,6 @@
18 18
 import copy
19 19
 import re
20 20
 import wmf
21  
-import lcp
22 21
 import query as query_func
23 22
 from wmf.dump.iterator import Iterator
24 23
 
@@ -102,8 +101,6 @@ def escape_variables(text):
102 101
                 revs = [x for x in revs]
103 102
                 for rev in revs:
104 103
                     texts.append(rev.getText().encode('utf-8'))
105  
-                # cs = lcp.common_substring(texts, len(texts) * 0.9, 20)
106  
-                # print cs
107 104
                 # missed = 0
108 105
                 for ((revPrev,timePrev), (revNext,timeNext)) in gen_prev_next([(x, x.getTimestamp()) for x in revs], (None, int(time.time()))):
109 106
 
23  web/client.py
... ...
@@ -1,23 +0,0 @@
1  
-import socket
2  
-import sys
3  
-import cPickle
4  
-
5  
-import settings
6  
-
7  
-# Create a socket (SOCK_STREAM means a TCP socket)
8  
-sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
9  
-
10  
-try:
11  
-    # Connect to server and send data
12  
-    data = '0.1'
13  
-    sock.connect((settings.HOST, settings.PORT))
14  
-    sock.send(data)
15  
-
16  
-    # Receive data from the server and shut down
17  
-    received = sock.recv(1024)
18  
-    result = cPickle.loads(received)
19  
-finally:
20  
-    sock.close()
21  
-
22  
-print "Sent:     {}".format(data)
23  
-print "Received: {}".format(result)
115  web/search.py
... ...
@@ -1,115 +0,0 @@
1  
-import os
2  
-import socket
3  
-import sys
4  
-import cPickle
5  
-import json
6  
-
7  
-import web
8  
-from web import form
9  
-
10  
-from mako.template import Template
11  
-from mako.runtime import Context
12  
-from mako.lookup import TemplateLookup
13  
-
14  
-from mimerender import mimerender
15  
-
16  
-import settings
17  
-
18  
-urls = (
19  
-'/', 'index'        
20  
-)
21  
-
22  
-
23  
-if settings.DEBUG:
24  
-    web.config.debug = True
25  
-
26  
-lookup = TemplateLookup(directories=['templates/'])
27  
-if settings.HOSTNAME == 'production':
28  
-    application = web.application(urls, globals()).wsgifunc()
29  
-else:
30  
-    app = web.application(urls, globals())
31  
-
32  
-lookup = TemplateLookup(directories=[os.path.join(os.path.dirname(__file__),'templates')])
33  
-render_json = lambda **args: json.dumps(args)
34  
-render_html = lambda message: message
35  
-
36  
-def serve_template(templatename, **kwargs):
37  
-    view = lookup.get_template(templatename)
38  
-    return view.render(**kwargs)
39  
-
40  
-
41  
-class index:
42  
-    def __init__(self, *args, **kwargs):
43  
-        self.links= {'rev_id':'w/index.php?diff=',
44  
-                     'title':'wiki/',
45  
-                     'user_text':'wiki/User:'}
46  
-        
47  
-    def searchform(self):
48  
-        search = form.Form(
49  
-            form.Textarea('query', form.notnull),
50  
-            form.Button('Search!')
51  
-        )
52  
-        return search
53  
-    
54  
-    def GET(self):
55  
-        search = self.searchform()
56  
-        return serve_template('index.html',form=search) 
57  
-    
58  
-    @mimerender(
59  
-        default= 'html',
60  
-        html = render_html,
61  
-        #json = render_json,
62  
-    )
63  
-    def POST(self):
64  
-        search = self.searchform()
65  
-        if not search.validates():
66  
-            return serve_template('index.html',form=search)
67  
-        else:
68  
-            query_str = search['query'].value
69  
-            results = self.fetch_results(query_str)
70  
-            headings = self.extract_headings(results)
71  
-            if settings.DEBUG:
72  
-                print results
73  
-            return serve_template('results.html',query_str=query_str, results=results, headings=headings, form=search)
74  
-    
75  
-    
76  
-    def extract_headings(self, results):
77  
-        return results.pop('headings')
78  
-        
79  
-        
80  
-    def fetch_results(self, query_str):
81  
-        # Create a socket (SOCK_STREAM means a TCP socket)
82  
-        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
83  
-        try:
84  
-            # Connect to server and send data
85  
-            sock.connect((settings.HOST, settings.PORT))
86  
-            sock.send(query_str)
87  
-        
88  
-            # Receive data from the server and shut down
89  
-            buffer = cStringIO.StringIO()
90  
-            buffer.write(sock.recv(4096))
91  
-            done = False
92  
-            while not done:
93  
-                more = sock.recv(4096)
94  
-                if not more:
95  
-                    done = True
96  
-                else:
97  
-                    buffer.write(more)
98  
-            #print buffer.getvalue()
99  
-            results = cPickle.loads(buffer.getvalue())
100  
-        except Exception,e:
101  
-            print e
102  
-            resuls = e
103  
-        finally:
104  
-            sock.close()
105  
-        
106  
-        return results
107  
-        
108  
-        #print "Sent:     {}".format(query_str)
109  
-        #print "Received: {}".format(result)
110  
-    
111  
-
112  
-if __name__ == '__main__':
113  
-    if app:
114  
-        app.run()
115  
-    
62  web/server.py
... ...
@@ -1,62 +0,0 @@
1  
-import SocketServer
2  
-import os
3  
-import cPickle
4  
-
5  
-import settings
6  
-from lucene import StandardAnalyzer, File, QueryParser, Version, SimpleFSDirectory, File, IndexSearcher, initVM 
7  
-
8  
-
9  
-vm = initVM()
10  
-index_dir = SimpleFSDirectory(File(settings.INDEX_DIR))
11  
-searcher = IndexSearcher(index_dir)
12  
-
13  
-class LuceneServer(SocketServer.BaseRequestHandler):
14  
-    """
15  
-    The RequestHandler class for our server.
16  
-
17  
-    It is instantiated once per connection to the server, and must
18  
-    override the handle() method to implement communication to the
19  
-    client.
20  
-    """
21  
-
22  
-    def serialize(self, hits):
23  
-        results = {}
24  
-        results['headings'] = ['score', 'contents'] #hardcoded not ideal 
25  
-        for hit in hits.scoreDocs:
26  
-            doc = searcher.doc(hit.doc)
27  
-            #print dir(doc)
28  
-            #print doc.getFields(), doc.getValues("contents")
29  
-            #print doc, doc.toString()
30  
-            doc.get('contents') #.encode("utf-8")
31  
-            results[hit.doc] = {}
32  
-            results[hit.doc]['score'] = hit.score
33  
-            results[hit.doc]['contents'] = doc.get('contents')
34  
-        return cPickle.dumps(results)
35  
-
36  
-    def handle(self):
37  
-        # self.request is the TCP socket connected to the client
38  
-        # self.rfile is a file-like object created by the handler;
39  
-        # we can now use e.g. readline() instead of raw recv() calls
40  
-        self.data = self.request.recv(1024).strip()
41  
-        #print "{} wrote:".format(self.client_address[0])
42  
-        #print self.data
43  
-        # just send back the same data, but upper-cased
44  
-        
45  
-        MAX = 50
46  
-        analyzer = StandardAnalyzer(Version.LUCENE_34)
47  
-        self.data = QueryParser.escape(self.data)
48  
-        query = QueryParser(Version.LUCENE_34, 'contents', analyzer).parse(self.data)
49  
-        
50  
-        hits = searcher.search(query, MAX)
51  
-        if settings.DEBUG:  
52  
-            print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
53  
-        serialized = self.serialize(hits)
54  
-        self.request.send(serialized)
55  
-
56  
-
57  
-if __name__ == "__main__":
58  
-    # Create the server, binding to localhost on port 9999
59  
-    server = SocketServer.TCPServer((settings.HOST, settings.PORT), LuceneServer)
60  
-    # Activate the server; this will keep running until you
61  
-    # interrupt the program with Ctrl-C
62  
-    server.serve_forever()
13  web/settings.py
... ...
@@ -1,13 +0,0 @@
1  
-import socket
2  
-import os
3  
-
4  
-hostname = socket.gethostname()
5  
-HOST, PORT = "localhost", 9999
6  
-
7  
-if hostname == 'alpha':
8  
-    hostname = 'production'
9  
-    INDEX_DIR = os.path.join('/','data-large', 'lucene', 'index')
10  
-    DEBUG=False
11  
-else:
12  
-    INDEX_DIR = os.path.join('c:\\','lucene-3.4.0','index')
13  
-    DEBUG=True
13  web/templates/base.html
... ...
@@ -1,13 +0,0 @@
1  
-# -*- coding: utf-8 -*-
2  
-<html>
3  
-  <head>
4  
-    ${self.head_tags()}
5  
-  </head>
6  
-  <body>
7  
-    ${self.body()}
8  
-  
9  
-  <div class="footer">
10  
-  	The DiffIndexer and the DiffSearcher are written by <a href="http://www.yusuke.matsubara.name/">Yusuke Matsubara</a> and <a href="http://meta.wikimedia.org/wiki/User:Drdee">Diederik van Liere</a>
11  
-  </div>
12  
-  </body>
13  
-</html>
12  web/templates/index.html
... ...
@@ -1,12 +0,0 @@
1  
-# -*- coding: utf-8 -*-
2  
-<%inherit file="base.html" />
3  
-
4  
-<%def name="head_tags()">
5  
- 	
6  
-	<title>DiffSearcher::search</title>
7  
- 	
8  
-</%def>
9  
-
10  
-<h1>DiffSearcher: the ultimate Wiki blame tool</h1>
11  
-
12  
-<%include file="searchform.html" />
40  web/templates/results.html
... ...
@@ -1,40 +0,0 @@
1  
-# -*- coding: utf-8 -*-
2  
-<%inherit file="base.html" />
3  
-
4  
-<%def name="head_tags()">
5  
- 	
6  
-	<title>DiffSearcher::results</title>
7  
- 	
8  
-</%def>
9  
-
10  
-<h1>You were searching for ${query_str}...</h1>
11  
-
12  
-% if results:
13  
-	<table>
14  
-		<tr>
15  
-		 % for heading in headings:
16  
-			<td class="even"><h3>${heading}</h3></td>
17  
-		 % endfor
18  
-		</tr>
19  
-
20  
-	% for x, hit in enumerate(results):
21  
-		% if x % 2 == 1:
22  
-			<tr class="even">
23  
-		% else:
24  
-			<tr class="odd">
25  
-		% endif
26  
-			% for heading in headings:
27  
-				% if heading in links:
28  
-					<td><a href="http://en.wikipedia.org/${links[heading]}${results[hit][heading]}">${results[hit][heading]}</a></td>
29  
-				% else:
30  
-					<td>${results[hit][heading]}</td>
31  
-				% endif
32  
-			% endfor
33  
-			</tr>
34  
-	% endfor
35  
-	</table>
36  
-% else:
37  
-<p> There were 0 results for ${query_str}.</p>
38  
-% endif
39  
-
40  
-<%include file="searchform.html" />
59  web/templates/searchform.html
... ...
@@ -1,59 +0,0 @@
1  
-
2  
-
3  
-
4  
-	<form name="search" method="post">
5  
-		${form.render()}
6  
-	</form>
7  
-
8  
-
9  
-<h2>Constructing queries in Lucene</h2>
10  
-<p>
11  
-You have the full Lucene query power at your disposal. The easiest query is:
12  
-	<pre>
13  
-	foo
14  
-	</pre>
15  
-This will return all revisions where the word foo was either added or removed.
16  
-</p>
17  
-
18  
-<p>
19  
-You can limit your results to certain namespaces:
20  
-	<pre>
21  
-	foo namespace:3
22  
-	</pre>
23  
-This will return all revisions where the word foo was either added or removed in namespace 3.
24  
-</p>
25  
-
26  
-<p>
27  
-You can search for a word in a particular field (see "Available fields" below), as long as that field's Field.Index is ANALYZED:
28  
-	<pre>
29  
-	title:foo namespace:4 
30  
-	</pre>
31  
-</p>
32  
-
33  
-
34  
-<h2>Available fields</h2>
35  
-<p>
36  
-<ul>
37  
-	<li>rev_id,		Field.Index.NOT_ANALYZED</li>
38  
-	<li>page_id,	Field.Index.NOT_ANALYZED</li>
39  
-	<li>namespace, 	Field.Index.NOT_ANALYZED</li>
40  
-	<li>title,    	Field.Index.ANALYZED</li>
41  
-	<li>timestamp, 	Field.Index.NOT_ANALYZED</li>
42  
-	<li>minor,     	Field.Index.NOT_ANALYZED</li>
43  
-	<li>user_id,   	Field.Index.NOT_ANALYZED</li>
44  
-	<li>user_text, 	Field.Index.ANALYZED</li>
45  
-</ul>
46  
-</p>
47  
-
48  
-
49  
-<h2>Next features</h2>
50  
-<ul>
51  
-<li> limit query to date range</li>
52  
-
53  
-</ul> 
54  
-
55  
-<h2>Limitations</h2>
56  
-<ul>
57  
-<li> You cannot search the comments at this moment. </li>
58  
-<li> You cannot limit a query to certain users. </li>
59  
-</ul>

0 notes on commit 75cbd15

Please sign in to comment.
Something went wrong with that request. Please try again.