From e290dcc5d32cab6f838a45e3bd02c996d337500a Mon Sep 17 00:00:00 2001
From: vibhcool
Date: Thu, 25 May 2017 14:55:45 +0530
Subject: [PATCH] Fixes #1112: Add image, video filter constraints for cache

---
 src/org/loklak/api/search/SearchServlet.java | 36 +++++++++++--
 src/org/loklak/data/DAO.java                 | 40 +++++++++++---
 src/org/loklak/harvester/TwitterScraper.java | 40 +++++++-------
 src/org/loklak/objects/QueryEntry.java       | 53 ++++++++++++++++---
 .../api/search/GithubProfileScraperTest.java | 45 ++++++++++++++++
 5 files changed, 175 insertions(+), 39 deletions(-)
 create mode 100644 test/org/loklak/api/search/GithubProfileScraperTest.java

diff --git a/src/org/loklak/api/search/SearchServlet.java b/src/org/loklak/api/search/SearchServlet.java
index 54ee3e1d6..7213178d3 100644
--- a/src/org/loklak/api/search/SearchServlet.java
+++ b/src/org/loklak/api/search/SearchServlet.java
@@ -26,6 +26,8 @@
 import java.util.Date;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.ArrayList;
+import java.util.Arrays;
 
 import javax.servlet.ServletException;
 import javax.servlet.http.HttpServlet;
@@ -151,7 +153,9 @@ protected void doGet(final HttpServletRequest request, final HttpServletResponse
                 DAO.getConfig(SEARCH_MAX_PUBLIC_COUNT_NAME, 100)));
         String filter = post.get("filter", post.get("filter", ""));
-
+        filter = filter.replaceAll("\\s", "");
+        ArrayList<String> filterList = new ArrayList<String>(Arrays.asList(filter.split(",")));
+
         // create tweet timeline
         final String ordername = post.get("order", Timeline.Order.CREATED_AT.getMessageFieldName());
         final Timeline.Order order = Timeline.parseOrder(ordername);
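A note on the parsing added above: String.replaceAll returns a new string, so the stripped value must be reassigned before splitting. A minimal, self-contained sketch of the same parsing, with a hypothetical parameter value:

    import java.util.ArrayList;
    import java.util.Arrays;

    public class FilterParamDemo {
        public static void main(String[] args) {
            String filter = " image , video ";      // hypothetical raw "filter" request parameter
            filter = filter.replaceAll("\\s", "");  // returns a new String; must be reassigned
            ArrayList<String> filterList =
                    new ArrayList<String>(Arrays.asList(filter.split(",")));
            System.out.println(filterList);         // prints [image, video]
        }
    }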
@@ -188,7 +192,7 @@ protected void doGet(final HttpServletRequest request, final HttpServletResponse
             public void run() {
                 final String scraper_query = tokens.translate4scraper();
                 DAO.log(request.getServletPath() + " scraping with query: " + scraper_query);
-                Timeline twitterTl = DAO.scrapeTwitter(post, filter, scraper_query, order, timezoneOffsetf, true, timeout, true);
+                Timeline twitterTl = DAO.scrapeTwitter(post, filterList, scraper_query, order, timezoneOffsetf, true, timeout, true);
                 count_twitter_new.set(twitterTl.size());
                 tl.putAll(QueryEntry.applyConstraint(twitterTl, tokens, false)); // pre-localized results are not filtered with location constraint any more
                 tl.setScraperInfo(twitterTl.getScraperInfo());
@@ -200,7 +204,18 @@ public void run() {
             // start a local search
             Thread localThread = queryf == null || queryf.length() == 0 ? null : new Thread() {
                 public void run() {
-                    DAO.SearchLocalMessages localSearchResult = new DAO.SearchLocalMessages(queryf, order, timezoneOffsetf, last_cache_search_time.get() > SEARCH_CACHE_THREASHOLD_TIME ? Math.min(maximumRecords, (int) DAO.getConfig(SEARCH_LOW_COUNT_NAME, 10)) : maximumRecords, agregation_limit, fields);
+
+                    DAO.SearchLocalMessages localSearchResult =
+                        new DAO.SearchLocalMessages(
+                            queryf,
+                            order,
+                            timezoneOffsetf,
+                            last_cache_search_time.get() > SEARCH_CACHE_THREASHOLD_TIME ?
+                                Math.min(maximumRecords, (int) DAO.getConfig(SEARCH_LOW_COUNT_NAME, 10)) : maximumRecords,
+                            agregation_limit,
+                            filterList,
+                            fields
+                        );
                     long time = System.currentTimeMillis() - start;
                     last_cache_search_time.set(time);
                     post.recordEvent("cache_time", time);
@@ -265,7 +280,7 @@ public void run() {
         } else if ("twitter".equals(source) && tokens.raw.length() > 0) {
             final String scraper_query = tokens.translate4scraper();
             DAO.log(request.getServletPath() + " scraping with query: " + scraper_query);
-            Timeline twitterTl = DAO.scrapeTwitter(post, filter, scraper_query, order, timezoneOffset, true, timeout, true);
+            Timeline twitterTl = DAO.scrapeTwitter(post, filterList, scraper_query, order, timezoneOffset, true, timeout, true);
             count_twitter_new.set(twitterTl.size());
             tl.putAll(QueryEntry.applyConstraint(twitterTl, tokens, false)); // pre-localized results are not filtered with location constraint any more
@@ -274,7 +289,18 @@
             // in this case we use all tweets, not only the latest one because it may happen that there are no new and that is not what the user expects
         } else if ("cache".equals(source)) {
-            DAO.SearchLocalMessages localSearchResult = new DAO.SearchLocalMessages(query, order, timezoneOffset, last_cache_search_time.get() > SEARCH_CACHE_THREASHOLD_TIME ? Math.min(maximumRecords, (int) DAO.getConfig(SEARCH_LOW_COUNT_NAME, 10)) : maximumRecords, agregation_limit, fields);
+            DAO.SearchLocalMessages localSearchResult =
+                new DAO.SearchLocalMessages(
+                    query,
+                    order,
+                    timezoneOffset,
+                    last_cache_search_time.get() > SEARCH_CACHE_THREASHOLD_TIME ?
+                        Math.min(maximumRecords, (int) DAO.getConfig(SEARCH_LOW_COUNT_NAME, 10))
+                        : maximumRecords,
+                    agregation_limit,
+                    filterList,
+                    fields
+                );
             cache_hits.set(localSearchResult.timeline.getHits());
             tl.putAll(localSearchResult.timeline);
             tl.setResultIndex(localSearchResult.timeline.getResultIndex());
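Both cache paths share the same back-off guard: if the previous cache search took longer than SEARCH_CACHE_THREASHOLD_TIME, the result count is capped at the SEARCH_LOW_COUNT_NAME configuration value. A self-contained sketch of that rule (the constant values here are illustrative, not loklak's defaults):

    public class CacheBackoffDemo {
        static final long THRESHOLD_MS = 3000; // illustrative stand-in for SEARCH_CACHE_THREASHOLD_TIME
        static final int LOW_COUNT = 10;       // illustrative stand-in for SEARCH_LOW_COUNT_NAME

        // when the last cache search was slow, ask for fewer records this time
        static int recordCount(long lastSearchMs, int maximumRecords) {
            return lastSearchMs > THRESHOLD_MS ? Math.min(maximumRecords, LOW_COUNT) : maximumRecords;
        }

        public static void main(String[] args) {
            System.out.println(recordCount(500, 100));  // fast previous search -> 100
            System.out.println(recordCount(5000, 100)); // slow previous search -> 10
        }
    }

An illustrative request exercising the new parameter would look like /api/search.json?q=fossasia&source=cache&filter=image,video, assuming the servlet's usual mount point.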
diff --git a/src/org/loklak/data/DAO.java b/src/org/loklak/data/DAO.java
index 4c95ac03d..1cdf77907 100644
--- a/src/org/loklak/data/DAO.java
+++ b/src/org/loklak/data/DAO.java
@@ -933,10 +933,19 @@ public static class SearchLocalMessages {
      * @param resultCount - the number of messages in the result; can be zero if only aggregations are wanted
      * @param aggregationLimit - the maximum count of facet entities, not search results
      * @param aggregationFields - names of the aggregation fields. If no aggregation is wanted, pass no (zero) field(s)
+     * @param filterList - list of filter names, e.g. "image" or "video"
      */
-    public SearchLocalMessages(final String q, final Timeline.Order order_field, final int timezoneOffset, final int resultCount, final int aggregationLimit, final String... aggregationFields) {
+    public SearchLocalMessages(
+            final String q,
+            final Timeline.Order order_field,
+            final int timezoneOffset,
+            final int resultCount,
+            final int aggregationLimit,
+            final ArrayList<String> filterList,
+            final String... aggregationFields
+    ) {
         this.timeline = new Timeline(order_field);
-        QueryEntry.ElasticsearchQuery sq = new QueryEntry.ElasticsearchQuery(q, timezoneOffset);
+        QueryEntry.ElasticsearchQuery sq = new QueryEntry.ElasticsearchQuery(q, timezoneOffset, filterList);
         long interval = sq.until.getTime() - sq.since.getTime();
         IndexName resultIndex;
         if (aggregationFields.length > 0 && q.contains("since:")) {
@@ -978,6 +987,25 @@ public SearchLocalMessages(final String q, final Timeline.Order order_field, fin
         }
         this.aggregations = query.aggregations;
     }
+
+    public SearchLocalMessages(
+            final String q,
+            final Timeline.Order order_field,
+            final int timezoneOffset,
+            final int resultCount,
+            final int aggregationLimit,
+            final String... aggregationFields
+    ) {
+        this(
+            q,
+            order_field,
+            timezoneOffset,
+            resultCount,
+            aggregationLimit,
+            new ArrayList<>(),
+            aggregationFields
+        );
+    }
 
     private static boolean insufficient(ElasticsearchClient.Query query, int resultCount, int aggregationLimit, String... aggregationFields) {
         return query.hitCount < resultCount || (aggregationFields.length > 0 && getAggregationResultLimit(query.aggregations) < aggregationLimit);
@@ -1108,12 +1136,12 @@ public static Timeline scrapeTwitter(
             long timeout,
             boolean recordQuery) {
 
-        return scrapeTwitter(post, "", q, order, timezoneOffset, byUserQuery, timeout, recordQuery);
+        return scrapeTwitter(post, new ArrayList<>(), q, order, timezoneOffset, byUserQuery, timeout, recordQuery);
     }
 
     public static Timeline scrapeTwitter(
             final Query post,
-            final String filter,
+            final ArrayList<String> filterList,
             final String q,
             final Timeline.Order order,
             final int timezoneOffset,
@@ -1133,7 +1161,7 @@ public static Timeline scrapeTwitter(
             // maybe the remote server died, we try then ourself
             start = System.currentTimeMillis();
-            tl = TwitterScraper.search(q, filter, order, true, true, 400);
+            tl = TwitterScraper.search(q, filterList, order, true, true, 400);
             if (post != null) post.recordEvent("local_scraper_after_unsuccessful_remote", System.currentTimeMillis() - start);
         } else {
             tl.writeToIndex();
@@ -1142,7 +1170,7 @@ public static Timeline scrapeTwitter(
             if (post != null && remote.size() > 0) post.recordEvent("omitted_scraper_latency_" + remote.get(0), peerLatency.get(remote.get(0)));
             long start = System.currentTimeMillis();
-            tl = TwitterScraper.search(q, filter, order, true, true, 400);
+            tl = TwitterScraper.search(q, filterList, order, true, true, 400);
             if (post != null) post.recordEvent("local_scraper", System.currentTimeMillis() - start);
         }
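SearchLocalMessages, scrapeTwitter and the ElasticsearchQuery constructor all gain their filterList parameter the same way: the old signature stays behind as an overload that delegates with an empty list, so existing callers compile unchanged and an empty list means "no filtering". The pattern in miniature (generic names, not loklak's):

    import java.util.ArrayList;

    class Searcher {
        // new signature: callers that care about filters pass a list
        int search(String q, ArrayList<String> filters) {
            return filters.size(); // stand-in for the real work
        }

        // old signature, kept for existing callers; delegates with a neutral default
        int search(String q) {
            return search(q, new ArrayList<>()); // empty list == "no filtering"
        }
    }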
search(query, new ArrayList<>(), order, writeToIndex, writeToBackend, jointime);
     }
 
-    public static String prepareSearchURL(final String query, final String filter) {
+    public static String prepareSearchURL(final String query, final ArrayList<String> filterList) {
         // check
         // https://twitter.com/search-advanced for a better syntax
         // build queries like https://twitter.com/search?f=tweets&vertical=default&q=kaffee&src=typd
@@ -110,7 +110,7 @@ public static String prepareSearchURL(final String query, final String filter) {
                 String q = t.length() == 0 ? "*" : URLEncoder.encode(t.substring(1), "UTF-8");
 
                 // type of content to fetch
-                if (filter.equals("video"))
+                if (filterList.contains("video") && filterList.size() == 1)
                     type = "videos";
 
                 // building url
@@ -126,19 +126,19 @@ public static Timeline[] search(
             final Timeline.Order order,
             final boolean writeToIndex,
             final boolean writeToBackend) {
-        return search(query, "", order, writeToIndex, writeToBackend);
+        return search(query, new ArrayList<>(), order, writeToIndex, writeToBackend);
     }
 
     public static Timeline[] search(
             final String query,
-            final String filter,
+            final ArrayList<String> filterList,
             final Timeline.Order order,
             final boolean writeToIndex,
             final boolean writeToBackend) {
         // check
         // https://twitter.com/search-advanced for a better syntax
         // https://support.twitter.com/articles/71577-how-to-use-advanced-twitter-search#
-        String https_url = prepareSearchURL(query, filter);
+        String https_url = prepareSearchURL(query, filterList);
         Timeline[] timelines = null;
         try {
             ClientConnection connection = new ClientConnection(https_url);
@@ -146,7 +146,7 @@ public static Timeline[] search(
             try {
                 BufferedReader br = new BufferedReader(new InputStreamReader(connection.inputStream, StandardCharsets.UTF_8));
-                timelines = search(br, filter, order, writeToIndex, writeToBackend);
+                timelines = search(br, filterList, order, writeToIndex, writeToBackend);
             } catch (IOException e) {
                 DAO.severe(e);
             } finally {
@@ -175,19 +175,19 @@ public static Timeline[] parse(
             final Timeline.Order order,
             final boolean writeToIndex,
             final boolean writeToBackend) {
-        return parse(file, "", order, writeToIndex, writeToBackend);
+        return parse(file, new ArrayList<>(), order, writeToIndex, writeToBackend);
     }
 
     public static Timeline[] parse(
             final File file,
-            String filter,
+            final ArrayList<String> filterList,
             final Timeline.Order order,
             final boolean writeToIndex,
             final boolean writeToBackend) {
         Timeline[] timelines = null;
         try {
             BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
-            timelines = search(br, filter, order, writeToIndex, writeToBackend);
+            timelines = search(br, filterList, order, writeToIndex, writeToBackend);
         } catch (IOException e) {
             DAO.severe(e);
         } finally {
@@ -207,7 +207,7 @@ public static Timeline[] search(
             final boolean writeToIndex,
             final boolean writeToBackend) throws IOException {
-        return search(br, "", order, writeToIndex, writeToBackend);
+        return search(br, new ArrayList<>(), order, writeToIndex, writeToBackend);
     }
 
     /**
@@ -219,7 +219,7 @@ public static Timeline[] search(
      */
     public static Timeline[] search(
             final BufferedReader br,
-            String filter,
+            final ArrayList<String> filterList,
             final Timeline.Order order,
             final boolean writeToIndex,
             final boolean writeToBackend) throws IOException {
@@ -231,9 +231,6 @@ public static Timeline[] search(
         };
         Timeline timelineReady = new Timeline(order);
         Timeline timelineWorking = new Timeline(order);
-        Set filter_array = new HashSet(Arrays.asList(
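The scraper treats the filter list in two ways: when "video" is the only entry, prepareSearchURL delegates the work to Twitter's own video vertical (f=videos); when "video" or "image" is combined with other filters, tweets are post-filtered while the result HTML is parsed. A simplified sketch of the URL decision (the real method also tokenizes and URL-encodes the query first):

    import java.util.ArrayList;
    import java.util.Arrays;

    public class SearchTypeDemo {
        // simplified version of the decision made in prepareSearchURL
        static String contentType(ArrayList<String> filterList) {
            // only when "video" is the sole filter can Twitter's own
            // video vertical (f=videos) do the filtering server-side
            if (filterList.contains("video") && filterList.size() == 1) return "videos";
            return "tweets"; // otherwise fetch all tweets and post-filter locally
        }

        public static void main(String[] args) {
            System.out.println(contentType(new ArrayList<>(Arrays.asList("video"))));          // videos
            System.out.println(contentType(new ArrayList<>(Arrays.asList("video", "image")))); // tweets
        }
    }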
-                (filter.toLowerCase()).split(",")
-        ));
         String input;
         Map props = new HashMap();
         Set images = new LinkedHashSet<>();
@@ -361,7 +358,7 @@ public static Timeline[] search(
                 if (props.size() == 10 || (debuglog && props.size() > 4 && input.indexOf("stream-item") > 0)) {
 
                     // filter tweets with videos and others
-                    if (filter_array.contains("video") && filter_array.size() > 1) {
+                    if (filterList.contains("video") && filterList.size() > 1) {
                         match_video1 = video_url_patterns[0].matcher(props.get("tweettext").value);
                         match_video2 = video_url_patterns[1].matcher(props.get("tweettext").value);
 
@@ -374,7 +371,7 @@ public static Timeline[] search(
                     }
 
                     // filter tweets with images
-                    if (filter_array.contains("image") && images.size() < 1) {
+                    if (filterList.contains("image") && images.size() < 1) {
                         props = new HashMap();
                         place_id = "";
                         place_name = "";
@@ -698,12 +695,13 @@ public static String unshorten(String text) {
      */
     public static void main(String[] args) {
         //wget --no-check-certificate "https://twitter.com/search?q=eifel&src=typd&f=realtime"
-        String filter = "image";
+        ArrayList<String> filterList = new ArrayList<>();
+        filterList.add("image");
         Timeline[] result = null;
         if (args[0].startsWith("/"))
             result = parse(new File(args[0]), Timeline.Order.CREATED_AT, true, true);
         else
-            result = TwitterScraper.search(args[0], filter, Timeline.Order.CREATED_AT, true, true);
+            result = TwitterScraper.search(args[0], filterList, Timeline.Order.CREATED_AT, true, true);
         int all = 0;
         for (int x = 0; x < 2; x++) {
             if (x == 0) System.out.println("Timeline[0] - finished to be used:");
diff --git a/src/org/loklak/objects/QueryEntry.java b/src/org/loklak/objects/QueryEntry.java
index af5b72fe2..99b98b2e8 100644
--- a/src/org/loklak/objects/QueryEntry.java
+++ b/src/org/loklak/objects/QueryEntry.java
@@ -495,33 +495,45 @@ public static class ElasticsearchQuery {
         public Date since;
         public Date until;
 
-        public ElasticsearchQuery(String q, int timezoneOffset) {
+        public ElasticsearchQuery(String q, int timezoneOffset, ArrayList<String> filterList) {
             // default values for since and until
             this.since = new Date(0);
             this.until = new Date(Long.MAX_VALUE);
+
             // parse the query
-            this.queryBuilder = preparse(q, timezoneOffset);
+            this.queryBuilder = preparse(q, timezoneOffset, filterList);
         }
 
-        private QueryBuilder preparse(String q, int timezoneOffset) {
+        public ElasticsearchQuery(String q, int timezoneOffset) {
+            this(q, timezoneOffset, new ArrayList<>());
+        }
+
+        private QueryBuilder preparse(String q, int timezoneOffset, ArrayList<String> filterList) {
             // detect usage of the OR connector
             q = QueryEntry.fixQueryMistakes(q);
             List terms = splitIntoORGroups(q); // OR binds stronger than AND
             if (terms.size() == 0) return QueryBuilders.constantScoreQuery(QueryBuilders.matchAllQuery());
-
             // special handling
-            if (terms.size() == 1) return parse(terms.get(0), timezoneOffset);
+            if (terms.size() == 1) return parse(terms.get(0), timezoneOffset, filterList);
 
             // generic handling
             BoolQueryBuilder aquery = QueryBuilders.boolQuery();
             for (String t: terms) {
-                QueryBuilder partial = parse(t, timezoneOffset);
+                QueryBuilder partial = parse(t, timezoneOffset, filterList);
                 aquery.filter(partial);
             }
             return aquery;
         }
+
+        private QueryBuilder preparse(String q, int timezoneOffset) {
+            return preparse(q, timezoneOffset, new ArrayList<>());
+        }
 
-        private QueryBuilder parse(String q, int timezoneOffset) {
+        private QueryBuilder parse(
+                String q,
+                int timezoneOffset,
+                ArrayList<String> filterList
+        ) {
             // detect usage of the OR connective. Because of the preparse step we will have only OR or only AND here.
             q = q.replaceAll(" AND ", " "); // AND is default
             boolean ORconnective = q.indexOf(" OR ") >= 0;
@@ -777,6 +789,24 @@ else if (ops.size() == 0 && nops.size() == 1)
                 filters.add(QueryBuilders.constantScoreQuery(QueryBuilders.termsQuery("place_context", (constraint_about ? PlaceContext.ABOUT : PlaceContext.FROM).name())));
             }
 
+            if (filterList.size() > 0) {
+                for (String filter: filterList) {
+                    switch (filter) {
+                        case "image":
+                        case "video":
+                            // exclude results whose images_count (or videos_count) is 0
+                            filters.add(QueryBuilders.boolQuery().mustNot(
+                                QueryBuilders.constantScoreQuery(QueryBuilders.termQuery(filter + "s_count", "0"))
+                            ));
+                            break;
+                        // TODO: Add more filters here
+                        default:
+                            break;
+                    }
+                }
+            }
+
             // special treatment of location constraints of the form /location=lon-west,lat-south,lon-east,lat-north i.e. /location=8.58,50.178,8.59,50.181
             // source_type constraint of the form /source_type=FOSSASIA_API -> search exact term (source_type must exists in SourceType enum)
             for (String cs: constraints_positive) {
@@ -835,7 +865,16 @@ else if (coord.length == 4 || coord.length == 5) {
             QueryBuilder cquery = filters.size() == 0 ? bquery : QueryBuilders.boolQuery().filter(bquery).filter(queryFilter);
             return cquery;
         }
+
+        private QueryBuilder parse(String q, int timezoneOffset) {
+            return parse(q, timezoneOffset, new ArrayList<>());
+        }
     }
 
     public static enum PlaceContext {
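For "image" and "video", the switch above excludes any document whose images_count or videos_count term is "0", so only messages that actually carry media remain in the result. A sketch of the clause in isolation, using the same Elasticsearch QueryBuilders API the patch relies on (printing the builder shows the generated JSON in the ES versions of that era; exact serialization may vary):

    import org.elasticsearch.index.query.BoolQueryBuilder;
    import org.elasticsearch.index.query.QueryBuilders;

    public class FilterClauseDemo {
        public static void main(String[] args) {
            String filter = "image";
            // keep only documents where images_count is NOT the term "0"
            BoolQueryBuilder clause = QueryBuilders.boolQuery().mustNot(
                    QueryBuilders.constantScoreQuery(
                            QueryBuilders.termQuery(filter + "s_count", "0")));
            System.out.println(clause); // prints the clause as query-DSL JSON
        }
    }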
diff --git a/test/org/loklak/api/search/GithubProfileScraperTest.java b/test/org/loklak/api/search/GithubProfileScraperTest.java
new file mode 100644
index 000000000..c4df62b6d
--- /dev/null
+++ b/test/org/loklak/api/search/GithubProfileScraperTest.java
@@ -0,0 +1,45 @@
+package org.loklak.api.search;
+
+import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import org.json.JSONObject;
+import org.loklak.api.search.GithubProfileScraper;
+import org.loklak.susi.SusiThought;
+
+public class GithubProfileScraperTest {
+    @Test
+    public void githubProfileScraperOrgTest() {
+        String profile = "fossasia";
+        String shortDescription = "Open Technologies in Asia";
+        String userName = "fossasia";
+        String userId = "6295529";
+        String location = "Singapore";
+        String specialLink = "http://fossasia.org";
+
+        SusiThought response = GithubProfileScraper.scrapeGithub(profile);
+        JSONObject fetchedProfile = (JSONObject) response.getData().get(0);
+
+        assertEquals(shortDescription, fetchedProfile.getString("short_description"));
+        assertEquals(userName, fetchedProfile.getString("user_name"));
+        assertEquals(userId, fetchedProfile.getString("user_id"));
+        assertEquals(location, fetchedProfile.getString("location"));
+        assertEquals(specialLink, fetchedProfile.getString("special_link"));
+    }
+
+    @Test
+    public void githubProfileScraperUserTest() {
+        String profile = "djmgit";
+        String userName = "djmgit";
+        String fullName = "Deepjyoti Mondal";
+        String specialLink = "http://djmgit.github.io";
+        String userId = "16368427";
+
+        SusiThought response = GithubProfileScraper.scrapeGithub(profile);
+        JSONObject fetchedProfile = (JSONObject) response.getData().get(0);
+
+        assertEquals(userName, fetchedProfile.getString("user_name"));
+        assertEquals(fullName, fetchedProfile.getString("full_name"));
+        assertEquals(specialLink, fetchedProfile.getString("special_link"));
+        assertEquals(userId, fetchedProfile.getString("user_id"));
+    }
+}
\ No newline at end of file