Fixes loklak#1112: Add image, video filter constraints for cache

vibhcool committed Jun 1, 2017
1 parent 9a3db58 commit e00b308

Showing 5 changed files with 177 additions and 63 deletions.
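
This commit threads a "filter" request parameter from SearchServlet through DAO into TwitterScraper, so that image and video constraints also apply to results answered from the cache, not only to freshly scraped ones. The parameter is a comma-separated list; an illustrative request (host, port, and endpoint path are assumptions, not taken from this commit) would be:

http://localhost:9000/api/search.json?q=fossasia&filter=image,video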
37 changes: 31 additions & 6 deletions src/org/loklak/api/search/SearchServlet.java
@@ -26,6 +26,8 @@
import java.util.Date;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.ArrayList;
import java.util.Arrays;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
@@ -150,8 +152,9 @@ protected void doGet(final HttpServletRequest request, final HttpServletResponse
DAO.getConfig(SEARCH_MAX_LOCALHOST_COUNT_NAME, 1000) :
DAO.getConfig(SEARCH_MAX_PUBLIC_COUNT_NAME, 100)));

String filter = post.get("filter", post.get("filter", ""));

String filter = post.get("filter", "").replaceAll("\\s","");
ArrayList<String> filterList = new ArrayList<String>(Arrays.asList(filter.split(",")));

// create tweet timeline
final String ordername = post.get("order", Timeline.Order.CREATED_AT.getMessageFieldName());
final Timeline.Order order = Timeline.parseOrder(ordername);
@@ -188,7 +191,7 @@ protected void doGet(final HttpServletRequest request, final HttpServletResponse
public void run() {
final String scraper_query = tokens.translate4scraper();
DAO.log(request.getServletPath() + " scraping with query: " + scraper_query);
Timeline twitterTl = DAO.scrapeTwitter(post, filter, scraper_query, order, timezoneOffsetf, true, timeout, true);
Timeline twitterTl = DAO.scrapeTwitter(post, filterList, scraper_query, order, timezoneOffsetf, true, timeout, true);
count_twitter_new.set(twitterTl.size());
tl.putAll(QueryEntry.applyConstraint(twitterTl, tokens, false)); // pre-localized results are not filtered with location constraint any more
tl.setScraperInfo(twitterTl.getScraperInfo());
@@ -200,7 +203,18 @@ public void run()
// start a local search
Thread localThread = queryf == null || queryf.length() == 0 ? null : new Thread() {
public void run() {
DAO.SearchLocalMessages localSearchResult = new DAO.SearchLocalMessages(queryf, order, timezoneOffsetf, last_cache_search_time.get() > SEARCH_CACHE_THREASHOLD_TIME ? Math.min(maximumRecords, (int) DAO.getConfig(SEARCH_LOW_COUNT_NAME, 10)) : maximumRecords, agregation_limit, fields);

DAO.SearchLocalMessages localSearchResult =
new DAO.SearchLocalMessages(
queryf,
order,
timezoneOffsetf,
last_cache_search_time.get() > SEARCH_CACHE_THREASHOLD_TIME ? Math.min(maximumRecords,
(int) DAO.getConfig(SEARCH_LOW_COUNT_NAME, 10)) : maximumRecords,
agregation_limit,
filterList,
fields
);
long time = System.currentTimeMillis() - start;
last_cache_search_time.set(time);
post.recordEvent("cache_time", time);
@@ -265,7 +279,7 @@ public void run()
} else if ("twitter".equals(source) && tokens.raw.length() > 0) {
final String scraper_query = tokens.translate4scraper();
DAO.log(request.getServletPath() + " scraping with query: " + scraper_query);
Timeline twitterTl = DAO.scrapeTwitter(post, filter, scraper_query, order, timezoneOffset, true, timeout, true);
Timeline twitterTl = DAO.scrapeTwitter(post, filterList, scraper_query, order, timezoneOffset, true, timeout, true);

count_twitter_new.set(twitterTl.size());
tl.putAll(QueryEntry.applyConstraint(twitterTl, tokens, false)); // pre-localized results are not filtered with location constraint any more
@@ -274,7 +288,18 @@
// in this case we use all tweets, not only the latest ones, because it may happen that there are no new tweets, and that is not what the user expects

} else if ("cache".equals(source)) {
DAO.SearchLocalMessages localSearchResult = new DAO.SearchLocalMessages(query, order, timezoneOffset, last_cache_search_time.get() > SEARCH_CACHE_THREASHOLD_TIME ? Math.min(maximumRecords, (int) DAO.getConfig(SEARCH_LOW_COUNT_NAME, 10)) : maximumRecords, agregation_limit, fields);
DAO.SearchLocalMessages localSearchResult =
new DAO.SearchLocalMessages(
query,
order,
timezoneOffset,
last_cache_search_time.get() > SEARCH_CACHE_THREASHOLD_TIME ?
Math.min(maximumRecords,(int) DAO.getConfig(SEARCH_LOW_COUNT_NAME, 10))
: maximumRecords,
agregation_limit,
filterList,
fields
);
cache_hits.set(localSearchResult.timeline.getHits());
tl.putAll(localSearchResult.timeline);
tl.setResultIndex(localSearchResult.timeline.getResultIndex());
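
The servlet change above normalizes the "filter" parameter once and hands the resulting list to both the scraper and the cache search. A minimal standalone sketch of that parsing step (class and method names are illustrative, not part of loklak):

import java.util.ArrayList;
import java.util.Arrays;

public class FilterParamDemo {

    // mirrors SearchServlet: strip all whitespace, then split on commas
    static ArrayList<String> parseFilter(String raw) {
        String filter = (raw == null ? "" : raw).replaceAll("\\s", "");
        return new ArrayList<String>(Arrays.asList(filter.split(",")));
    }

    public static void main(String[] args) {
        System.out.println(parseFilter(" image, video ")); // [image, video]
        // caveat: "".split(",") yields {""}, so an empty parameter produces
        // a one-element list containing the empty string, not an empty list
        System.out.println(parseFilter("").size());        // 1
    }
}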
40 changes: 34 additions & 6 deletions src/org/loklak/data/DAO.java
@@ -933,10 +933,19 @@ public static class SearchLocalMessages
* @param resultCount - the number of messages in the result; can be zero if only aggregations are wanted
* @param aggregationLimit - the maximum count of facet entities, not search results
* @param aggregationFields - names of the aggregation fields. If no aggregation is wanted, pass no (zero) field(s)
* @param filterList - list of filter constraints (e.g. "image", "video") as Strings
*/
public SearchLocalMessages(final String q, final Timeline.Order order_field, final int timezoneOffset, final int resultCount, final int aggregationLimit, final String... aggregationFields) {
public SearchLocalMessages (
final String q,
final Timeline.Order order_field,
final int timezoneOffset,
final int resultCount,
final int aggregationLimit,
final ArrayList<String> filterList,
final String... aggregationFields
) {
this.timeline = new Timeline(order_field);
QueryEntry.ElasticsearchQuery sq = new QueryEntry.ElasticsearchQuery(q, timezoneOffset);
QueryEntry.ElasticsearchQuery sq = new QueryEntry.ElasticsearchQuery(q, timezoneOffset, filterList);
long interval = sq.until.getTime() - sq.since.getTime();
IndexName resultIndex;
if (aggregationFields.length > 0 && q.contains("since:")) {
@@ -978,6 +987,25 @@ public SearchLocalMessages(final String q, final Timeline.Order order_field, fin
}
this.aggregations = query.aggregations;
}

public SearchLocalMessages (
final String q,
final Timeline.Order order_field,
final int timezoneOffset,
final int resultCount,
final int aggregationLimit,
final String... aggregationFields
) {
this(
q,
order_field,
timezoneOffset,
resultCount,
aggregationLimit,
new ArrayList<>(),
aggregationFields
);
}

private static boolean insufficient(ElasticsearchClient.Query query, int resultCount, int aggregationLimit, String... aggregationFields) {
return query.hitCount < resultCount || (aggregationFields.length > 0 && getAggregationResultLimit(query.aggregations) < aggregationLimit);
@@ -1108,12 +1136,12 @@ public static Timeline scrapeTwitter(
long timeout,
boolean recordQuery) {

return scrapeTwitter(post, "", q, order, timezoneOffset, byUserQuery, timeout, recordQuery);
return scrapeTwitter(post, new ArrayList<>(), q, order, timezoneOffset, byUserQuery, timeout, recordQuery);
}

public static Timeline scrapeTwitter(
final Query post,
final String filter,
final ArrayList<String> filterList,
final String q,
final Timeline.Order order,
final int timezoneOffset,
@@ -1133,7 +1161,7 @@ public static Timeline scrapeTwitter(
// maybe the remote server died, we try then ourself
start = System.currentTimeMillis();

tl = TwitterScraper.search(q, filter, order, true, true, 400);
tl = TwitterScraper.search(q, filterList, order, true, true, 400);
if (post != null) post.recordEvent("local_scraper_after_unsuccessful_remote", System.currentTimeMillis() - start);
} else {
tl.writeToIndex();
@@ -1142,7 +1170,7 @@
if (post != null && remote.size() > 0) post.recordEvent("omitted_scraper_latency_" + remote.get(0), peerLatency.get(remote.get(0)));
long start = System.currentTimeMillis();

tl = TwitterScraper.search(q, filter, order, true, true, 400);
tl = TwitterScraper.search(q, filterList, order, true, true, 400);
if (post != null) post.recordEvent("local_scraper", System.currentTimeMillis() - start);
}

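
DAO gains a filter-aware primary constructor for SearchLocalMessages, while the old signature is kept and delegates with an empty list, so existing call sites compile unchanged; the filter-less scrapeTwitter overload uses the same default. A compact sketch of that delegation pattern (names are illustrative):

import java.util.ArrayList;
import java.util.List;

class LocalSearchDemo {

    final String query;
    final List<String> filterList;

    // new primary constructor: accepts the filter constraints
    LocalSearchDemo(String query, ArrayList<String> filterList) {
        this.query = query;
        this.filterList = filterList;
    }

    // legacy signature: delegates with an empty, i.e. no-op, filter list
    LocalSearchDemo(String query) {
        this(query, new ArrayList<String>());
    }
}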
95 changes: 56 additions & 39 deletions src/org/loklak/harvester/TwitterScraper.java
@@ -76,13 +76,13 @@ public class TwitterScraper

public static Timeline search(
final String query,
final String filter,
final ArrayList<String> filterList,
final Timeline.Order order,
final boolean writeToIndex,
final boolean writeToBackend,
int jointime) {

Timeline[] tl = search(query, filter.replaceAll("\\s",""), order, writeToIndex, writeToBackend);
Timeline[] tl = search(query, filterList, order, writeToIndex, writeToBackend);
long timeout = System.currentTimeMillis() + jointime;
for (MessageEntry me: tl[1]) {
assert me instanceof TwitterTweet;
@@ -99,10 +99,15 @@ public static Timeline search(
final boolean writeToIndex,
final boolean writeToBackend,
int jointime) {
return search(query, "", order, writeToIndex, writeToBackend, jointime);

return search(query, new ArrayList<>(), order, writeToIndex, writeToBackend, jointime);
}

private static String prepareSearchURL(final String query, final String filter) {
public static String prepareSearchURL(final String query, final ArrayList<String> filterList) {
// check
// https://twitter.com/search-advanced for a better syntax
// build queries like https://twitter.com/search?f=tweets&vertical=default&q=kaffee&src=typd
@@ -125,8 +130,9 @@ private static String prepareSearchURL(final String query, final String filter)
String q = t.length() == 0 ? "*" : URLEncoder.encode(t.substring(1), "UTF-8");

// type of content to fetch
if(filter.equals("video"))
if(filterList.contains("video") && filterList.size() == 1) {
type = "videos";
}

// building url
https_url = "https://twitter.com/search?f="
@@ -142,27 +148,27 @@ private static Timeline[] search(
final Timeline.Order order,
final boolean writeToIndex,
final boolean writeToBackend) {
return search(query, "", order, writeToIndex, writeToBackend);
return search(query, new ArrayList<>(), order, writeToIndex, writeToBackend);
}

private static Timeline[] search(
final String query,
final String filter,
final ArrayList<String> filterList,
final Timeline.Order order,
final boolean writeToIndex,
final boolean writeToBackend) {
// check
// https://twitter.com/search-advanced for a better syntax
// https://support.twitter.com/articles/71577-how-to-use-advanced-twitter-search#
String https_url = prepareSearchURL(query, filter);
String https_url = prepareSearchURL(query, filterList);
Timeline[] timelines = null;
try {
ClientConnection connection = new ClientConnection(https_url);
if (connection.inputStream == null) return null;
try {
BufferedReader br = new BufferedReader(new InputStreamReader(connection.inputStream, StandardCharsets.UTF_8));

timelines = search(br, filter, order, writeToIndex, writeToBackend);
timelines = search(br, filterList, order, writeToIndex, writeToBackend);
} catch (IOException e) {
DAO.severe(e);
} finally {
@@ -191,19 +197,19 @@ private static Timeline[] parse(
final Timeline.Order order,
final boolean writeToIndex,
final boolean writeToBackend) {
return parse(file, "", order, writeToIndex, writeToBackend);
return parse(file, new ArrayList<>(), order, writeToIndex, writeToBackend);
}

private static Timeline[] parse(
final File file,
String filter,
final ArrayList<String> filterList,
final Timeline.Order order,
final boolean writeToIndex,
final boolean writeToBackend) {
Timeline[] timelines = null;
try {
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
timelines = search(br, filter, order, writeToIndex, writeToBackend);
timelines = search(br, filterList, order, writeToIndex, writeToBackend);
} catch (IOException e) {
DAO.severe(e);
} finally {
@@ -223,7 +229,7 @@ private static Timeline[] search(
final boolean writeToIndex,
final boolean writeToBackend) throws IOException {

return search(br, "", order, writeToIndex, writeToBackend);
return search(br, new ArrayList<>(), order, writeToIndex, writeToBackend);
}

/**
@@ -235,21 +241,12 @@ private static Timeline[] search(
*/
private static Timeline[] search(
final BufferedReader br,
String filter,
final ArrayList<String> filterList,
final Timeline.Order order,
final boolean writeToIndex,
final boolean writeToBackend) throws IOException {
Matcher match_video1;
Matcher match_video2;
Pattern[] video_url_patterns = {
Pattern.compile("youtu.be\\/[0-9A-z]+"),
Pattern.compile("youtube.com\\/watch?v=[0-9A-z]+")
};
Timeline timelineReady = new Timeline(order);
Timeline timelineWorking = new Timeline(order);
Set<String> filter_array = new HashSet<String>(Arrays.asList(
(filter.toLowerCase()).split(",")
));
String input;
Map<String, prop> props = new HashMap<String, prop>();
Set<String> images = new LinkedHashSet<>();
@@ -381,21 +378,7 @@

if (props.size() == 10 || (debuglog && props.size() > 4 && input.indexOf("stream-item") > 0)) {

// filter tweets with videos and others
if (filter_array.contains("video") && filter_array.size() > 1) {
match_video1 = video_url_patterns[0].matcher(props.get("tweettext").value);
match_video2 = video_url_patterns[1].matcher(props.get("tweettext").value);

if(!match_video1.find() && !match_video2.find() && videos.size() < 1) {
props = new HashMap<String, prop>();
place_id = "";
place_name = "";
continue;
}
}

// filter tweets with images
if (filter_array.contains("image") && images.size() < 1) {
if(!filterPosts(filterList, props, videos, images)) {
props = new HashMap<String, prop>();
place_id = "";
place_name = "";
@@ -568,6 +551,39 @@ private static String getBearerTokenFromJs(String jsUrl) throws IOException {
throw new IOException("Couldn't get BEARER_TOKEN");
}

private static boolean filterPosts(
ArrayList<String> filterList,
Map<String, prop> props,
Set<String> videos,
Set<String> images
) {
Matcher match_video1;
Matcher match_video2;
Pattern[] video_url_patterns = {
Pattern.compile("youtu\\.be/[0-9A-Za-z_-]+"),
Pattern.compile("youtube\\.com/watch\\?v=[0-9A-Za-z_-]+")
};

// filter tweets with videos and others
if (filterList.contains("video") && filterList.size() > 1) {
match_video1 = video_url_patterns[0].matcher(props.get("tweettext").value);
match_video2 = video_url_patterns[1].matcher(props.get("tweettext").value);

if(!match_video1.find() && !match_video2.find() && videos.size() < 1) {
return false;
}
}

// filter tweets with images
if (filterList.contains("image") && images.size() < 1) {
return false;
}

//TODO: Add more filters

return true;
}

private static class prop {
public String key, value = null;
public prop(String value) {
Expand Down Expand Up @@ -811,12 +827,13 @@ public static String unshorten(String text) {
*/
public static void main(String[] args) {
//wget --no-check-certificate "https://twitter.com/search?q=eifel&src=typd&f=realtime"
String filter = "image";
ArrayList<String> filterList = new ArrayList<String>();
filterList.add("image");
Timeline[] result = null;
if (args[0].startsWith("/"))
result = parse(new File(args[0]),Timeline.Order.CREATED_AT, true, true);
else
result = TwitterScraper.search(args[0], filter, Timeline.Order.CREATED_AT, true, true);
result = TwitterScraper.search(args[0], filterList, Timeline.Order.CREATED_AT, true, true);
int all = 0;
for (int x = 0; x < 2; x++) {
if (x == 0) System.out.println("Timeline[0] - finished to be used:");
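
TwitterScraper now funnels the per-tweet media checks through filterPosts(): a tweet is dropped as soon as it fails one of the requested constraints. Note the asymmetry: when "video" is the only filter, prepareSearchURL already requests f=videos from Twitter, so the client-side URL matching only runs when "video" is combined with other filters. A self-contained sketch of that decision (names and inputs are illustrative, not the loklak API):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

public class FilterPostsDemo {

    private static final Pattern[] VIDEO_URL_PATTERNS = {
        Pattern.compile("youtu\\.be/[0-9A-Za-z_-]+"),
        Pattern.compile("youtube\\.com/watch\\?v=[0-9A-Za-z_-]+")
    };

    // keep a tweet only if it satisfies every requested media constraint
    static boolean keep(Set<String> filters, String tweetText, int videoCount, int imageCount) {
        if (filters.contains("video") && filters.size() > 1) {
            boolean hasVideoUrl = VIDEO_URL_PATTERNS[0].matcher(tweetText).find()
                                || VIDEO_URL_PATTERNS[1].matcher(tweetText).find();
            if (!hasVideoUrl && videoCount < 1) return false; // no evidence of a video
        }
        if (filters.contains("image") && imageCount < 1) return false; // no attached image
        return true;
    }

    public static void main(String[] args) {
        Set<String> f = new HashSet<>(Arrays.asList("image", "video"));
        System.out.println(keep(f, "see https://youtu.be/dQw4w9WgXcQ", 0, 1)); // true
        System.out.println(keep(f, "plain text, no media", 0, 0));             // false
    }
}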
