core: use the ua_classifier-supplied is_crawler flag.
Fall back to the hardcoded list for unknown user agents (example: curl).
Fixes #1340

(cherry picked from commit 0682a67)
mworrell committed Jul 5, 2016
1 parent d4608b6 commit 89ac476
Showing 3 changed files with 52 additions and 20 deletions.
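
In short: when the ua_classifier knows a user agent, its own is_crawler verdict is used; when it does not (curl is the example from the commit message), the hardcoded prefix list in z_user_agent decides. A minimal sketch of the resulting behaviour follows; the module and function names are made up for illustration, only z_user_agent:is_crawler/1 is part of this commit, and the curl outcome assumes the installed ua_classifier does not recognise curl.

-module(is_crawler_sketch).
-export([examples/0]).

%% Illustrative only; outcomes depend on the ua_classifier build in use.
examples() ->
    true  = z_user_agent:is_crawler(<<"curl/7.43.0">>),  %% unknown to the classifier: the hardcoded prefix list answers
    false = z_user_agent:is_crawler(undefined),          %% no user-agent string known at all
    ok.
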
doc/ref/models/model_req.rst (2 changes: 1 addition & 1 deletion)
@@ -36,7 +36,7 @@ This will show something like::
{qs,[{"foo","bar"}]},
{referrer,"http://test.dev:8000/"},
{user_agent,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/601.4.4 (KHTML, like Gecko) Version/9.0.3 Safari/601.4.4"},
{is_bot, false},
{is_crawler,false},
{req_id,525158920},
{headers,[{"accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
src/models/m_req.erl (5 changes: 3 additions & 2 deletions)
@@ -60,6 +60,7 @@ get(ua_class, #context{} = Context) -> z_user_agent:get_class(Context);
get(ua_props, #context{} = Context) -> z_user_agent:get_props(Context);
get(timezone, #context{} = Context) -> z_context:tz(Context);
get(language, #context{} = Context) -> z_context:language(Context);
get(is_crawler, #context{} = Context) -> z_user_agent:is_crawler(Context);
get(What, #context{} = Context) -> get_req(What, z_context:get_reqdata(Context));
get(What, #wm_reqdata{} = RD) -> get_req(What, RD).

@@ -77,14 +78,14 @@ get_req(user_agent, RD) -> wrq:get_req_header_lc("user-agent", RD);
get_req(referer, RD) -> wrq:get_req_header_lc("referer", RD);
get_req(referrer, RD) -> wrq:get_req_header_lc("referer", RD);
get_req(req_id, #wm_reqdata{log_data=#wm_log_data{req_id=ReqId}}) -> ReqId;
get_req(is_bot, RD) -> z_user_agent:is_bot(get_req(user_agent, RD));
get_req(is_crawler, RD) -> z_user_agent:is_crawler(RD);
get_req(_Key, _RD) -> undefined.


-spec values(#context{}) -> list({atom(), any()}).
values(Context) ->
[ {K, get(K, Context)} || K <- [
method, version, peer, is_ssl, host, raw_path, path, qs, referrer, user_agent, is_bot,
method, version, peer, is_ssl, host, raw_path, path, qs, referrer, user_agent, is_crawler,
req_id, headers, ua_class, ua_props, timezone, language
]
].
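
Callers that previously asked m_req for is_bot now ask for is_crawler. Below is a minimal sketch of such a caller, assuming the dispatch above; the module name, maybe_log_visit/1 and log_visit/1 are hypothetical, and only m_req:get/2 comes from this change.

-module(visit_counter_sketch).
-export([maybe_log_visit/1]).

%% Skip per-visit bookkeeping for crawlers by consulting the renamed key.
maybe_log_visit(Context) ->
    case m_req:get(is_crawler, Context) of
        true -> ok;                    %% crawler: do nothing
        _    -> log_visit(Context)
    end.

%% Placeholder for whatever per-visit work a site actually does.
log_visit(_Context) ->
    ok.

The same rename applies to template code that reads the value through the m.req model documented above.
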
src/support/z_user_agent.erl (65 changes: 48 additions & 17 deletions)
@@ -31,7 +31,7 @@
order_class/2,
classes/0,
classes_fallback/1,
is_bot/1
is_crawler/1
]).

-include_lib("zotonic.hrl").
@@ -121,14 +121,34 @@ get_ua_req_data(ReqData) ->
undefined ->
case wrq:get_req_header_lc("user-agent", ReqData) of
undefined ->
{desktop, []};
{desktop, [
{is_crawler, not is_websocket_request(ReqData)}
]};
UserAgent ->
ua_classify(UserAgent)
{Class, Props} = ua_classify(UserAgent),
case proplists:is_defined(is_crawler, Props) of
true ->
{Class, Props};
false ->
{Class, [
{is_crawler, is_crawler_ua(UserAgent)}
| Props
]}
end
end;
Class ->
{Class, []}
{Class, [{is_crawler, false}]}
end.


%% @doc Some user agents don't send the User-Agent header on websocket requests
is_websocket_request(ReqData) ->
case wrq:get_req_header_lc("upgrade", ReqData) of
undefined -> false;
"websocket" -> true;
_ -> false
end.

%% @doc We send the page's ua_class along with websocket connect requests
get_ua_class_qs(ReqData) ->
to_ua_class(wrq:get_qs_value(?SESSION_UA_CLASS_Q, ReqData)).

@@ -144,7 +164,7 @@ ua_classify(UserAgent) ->
%% Note: Do not call z_config:get(use_ua_classifier) here. Otherwise
%% every request will call z_config gen_server making it a potential
%% bottleneck.
{desktop, []};
{desktop, []};
{error, Reason} ->
error_logger:warning_msg("z_user_agent: ua_classifier returned error. [UA: ~p] [Reason: ~p]~n", [UserAgent, Reason]),
{desktop, []}
@@ -183,6 +203,23 @@ get_props(#wm_reqdata{} = ReqData) ->
Props -> Props
end.

%% @doc Check if the user agent is probably a bot.
-spec is_crawler(string()|binary()|#context{}|#wm_reqdata{}|undefined) -> boolean().
is_crawler(undefined) ->
false;
is_crawler(#context{} = Context) ->
{_, Props} = get_props(Context),
proplists:get_value(is_crawler, Props, false);
is_crawler(#wm_reqdata{} = RD) ->
{_, Props} = get_props(RD),
proplists:get_value(is_crawler, Props, false);
is_crawler(UA) ->
{_, Props} = ua_classify(UA),
case proplists:get_value(is_crawler, Props) of
undefined -> is_crawler_ua(UA);
IsCrawler -> IsCrawler
end.

%% @doc The user selects an user agent by hand. Update cookie and session.
-spec ua_select(ua_classifier:device_type() | automatic, #context{}) -> #context{}.
ua_select(automatic, Context) ->
@@ -355,17 +392,10 @@ classes_fallback(UAClass) ->
lists:dropwhile(fun(X) -> X =/= UAClass end, lists:reverse(classes())).



%% @doc Check if the user agent is probably a bot.
-spec is_bot(string()|binary()|undefined) -> boolean().
is_bot(#context{} = Context) ->
is_bot(m_req:get(user_agent, Context));
is_bot(undefined) ->
% Normal user agents all have a "User-Agent" header.
true;
is_bot(UA) when is_list(UA) ->
is_bot(z_convert:to_binary(UA));
is_bot(UA) ->
%% @doc List of hardcoded crawlers, as the ua_classifier doesn't have all crawlers.
is_crawler_ua(UA) when is_list(UA) ->
is_crawler_ua(z_convert:to_binary(UA));
is_crawler_ua(UA) when is_binary(UA) ->
UAlower = z_string:to_lower(UA),
is_bot_prefix(UAlower)
orelse lists:any(fun(Bot) ->
@@ -376,6 +406,7 @@
%% @doc Quick check on prefix of the user-agent string
is_bot_prefix(<<"mozilla/", _/binary>>) -> false;
% Usual bots
is_bot_prefix(<<"curl/", _/binary>>) -> true;
is_bot_prefix(<<"facebookexternalhit", _/binary>>) -> true;
is_bot_prefix(<<"facebot", _/binary>>) -> true;
is_bot_prefix(<<"feedfetcher-google", _/binary>>) -> true;
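
As the spec above shows, the new is_crawler/1 accepts a #context{}, a #wm_reqdata{}, a user-agent string or binary, or undefined, so existing call sites can pass whatever they already have. A sketch of the call forms; the module and shapes/2 are placeholders, and the curl string reuses the example from the commit message.

-module(is_crawler_shapes_sketch).
-export([shapes/2]).

%% Placeholder showing the argument shapes the new API accepts.
shapes(Context, ReqData) ->
    [ z_user_agent:is_crawler(Context),           %% #context{}: via get_props/1
      z_user_agent:is_crawler(ReqData),           %% #wm_reqdata{}: via get_props/1
      z_user_agent:is_crawler(<<"curl/7.43.0">>), %% raw user-agent, classified on the spot
      z_user_agent:is_crawler(undefined)          %% no user agent known: false
    ].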
