diff --git a/doc/ref/models/model_req.rst b/doc/ref/models/model_req.rst index 1f46749f7b..8dd7d13832 100644 --- a/doc/ref/models/model_req.rst +++ b/doc/ref/models/model_req.rst @@ -36,7 +36,7 @@ This will show something like:: {qs,[{"foo","bar"}]}, {referrer,"http://test.dev:8000/"}, {user_agent,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/601.4.4 (KHTML, like Gecko) Version/9.0.3 Safari/601.4.4"}, - {is_bot, false}, + {is_crawler,false}, {req_id,525158920}, {headers,[{"accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}, diff --git a/src/models/m_req.erl b/src/models/m_req.erl index 871e59064a..c61a036907 100644 --- a/src/models/m_req.erl +++ b/src/models/m_req.erl @@ -60,6 +60,7 @@ get(ua_class, #context{} = Context) -> z_user_agent:get_class(Context); get(ua_props, #context{} = Context) -> z_user_agent:get_props(Context); get(timezone, #context{} = Context) -> z_context:tz(Context); get(language, #context{} = Context) -> z_context:language(Context); +get(is_crawler, #context{} = Context) -> z_user_agent:is_crawler(Context); get(What, #context{} = Context) -> get_req(What, z_context:get_reqdata(Context)); get(What, #wm_reqdata{} = RD) -> get_req(What, RD). @@ -77,14 +78,14 @@ get_req(user_agent, RD) -> wrq:get_req_header_lc("user-agent", RD); get_req(referer, RD) -> wrq:get_req_header_lc("referer", RD); get_req(referrer, RD) -> wrq:get_req_header_lc("referer", RD); get_req(req_id, #wm_reqdata{log_data=#wm_log_data{req_id=ReqId}}) -> ReqId; -get_req(is_bot, RD) -> z_user_agent:is_bot(get_req(user_agent, RD)); +get_req(is_crawler, RD) -> z_user_agent:is_crawler(RD); get_req(_Key, _RD) -> undefined. -spec values(#context{}) -> list({atom(), any()}). 
values(Context) -> [ {K, get(K, Context)} || K <- [ - method, version, peer, is_ssl, host, raw_path, path, qs, referrer, user_agent, is_bot, + method, version, peer, is_ssl, host, raw_path, path, qs, referrer, user_agent, is_crawler, req_id, headers, ua_class, ua_props, timezone, language ] ]. diff --git a/src/support/z_user_agent.erl b/src/support/z_user_agent.erl index 5339ddacfd..39e02c1c26 100644 --- a/src/support/z_user_agent.erl +++ b/src/support/z_user_agent.erl @@ -31,7 +31,7 @@ order_class/2, classes/0, classes_fallback/1, - is_bot/1 + is_crawler/1 ]). -include_lib("zotonic.hrl"). @@ -121,14 +121,34 @@ get_ua_req_data(ReqData) -> undefined -> case wrq:get_req_header_lc("user-agent", ReqData) of undefined -> - {desktop, []}; + {desktop, [ + {is_crawler, not is_websocket_request(ReqData)} + ]}; UserAgent -> - ua_classify(UserAgent) + {Class, Props} = ua_classify(UserAgent), + case proplists:is_defined(is_crawler, Props) of + true -> + {Class, Props}; + false -> + {Class, [ + {is_crawler, is_crawler_ua(UserAgent)} + | Props + ]} + end end; Class -> - {Class, []} + {Class, [{is_crawler, false}]} end. - + +%% @doc Some user agents don't send the User-Agent header on websocket requests +is_websocket_request(ReqData) -> + case wrq:get_req_header_lc("upgrade", ReqData) of + undefined -> false; + "websocket" -> true; + _ -> false + end. + +%% @doc We send the page's ua_class along with websocket connect requests get_ua_class_qs(ReqData) -> to_ua_class(wrq:get_qs_value(?SESSION_UA_CLASS_Q, ReqData)). @@ -144,7 +164,7 @@ ua_classify(UserAgent) -> %% Note: Do not call z_config:get(use_ua_classifier) here. Otherwise %% every request will call z_config gen_server making it a potential %% bottleneck. - {desktop, []}; + {desktop, []}; {error, Reason} -> error_logger:warning_msg("z_user_agent: ua_classifier returned error. [UA: ~p] [Reason: ~p]~n", [UserAgent, Reason]), {desktop, []} @@ -183,6 +203,23 @@ get_props(#wm_reqdata{} = ReqData) -> Props -> Props end. 
+%% @doc Check if the user agent is probably a crawler. +-spec is_crawler(string()|binary()|#context{}|#wm_reqdata{}|undefined) -> boolean(). +is_crawler(undefined) -> + false; +is_crawler(#context{} = Context) -> + {_, Props} = get_props(Context), + proplists:get_value(is_crawler, Props, false); +is_crawler(#wm_reqdata{} = RD) -> + {_, Props} = get_props(RD), + proplists:get_value(is_crawler, Props, false); +is_crawler(UA) -> + {_, Props} = ua_classify(UA), + case proplists:get_value(is_crawler, Props) of + undefined -> is_crawler_ua(UA); + IsCrawler -> IsCrawler + end. + %% @doc The user selects an user agent by hand. Update cookie and session. -spec ua_select(ua_classifier:device_type() | automatic, #context{}) -> #context{}. ua_select(automatic, Context) -> @@ -355,17 +392,10 @@ classes_fallback(UAClass) -> lists:dropwhile(fun(X) -> X =/= UAClass end, lists:reverse(classes())). - -%% @doc Check if the user agent is probably a bot. --spec is_bot(string()|binary()|undefined) -> boolean(). -is_bot(#context{} = Context) -> - is_bot(m_req:get(user_agent, Context)); -is_bot(undefined) -> - % Normal user agents all have a "User-Agent" header. - true; -is_bot(UA) when is_list(UA) -> - is_bot(z_convert:to_binary(UA)); -is_bot(UA) -> +%% @doc List of hardcoded crawlers, as the ua_classifier doesn't have all crawlers. +is_crawler_ua(UA) when is_list(UA) -> + is_crawler_ua(z_convert:to_binary(UA)); +is_crawler_ua(UA) when is_binary(UA) -> UAlower = z_string:to_lower(UA), is_bot_prefix(UAlower) orelse lists:any(fun(Bot) -> @@ -376,6 +406,7 @@ is_bot(UA) -> %% @doc Quick check on prefix of the user-agent string is_bot_prefix(<<"mozilla/", _/binary>>) -> false; % Usual bots +is_bot_prefix(<<"curl/", _/binary>>) -> true; is_bot_prefix(<<"facebookexternalhit", _/binary>>) -> true; is_bot_prefix(<<"facebot", _/binary>>) -> true; is_bot_prefix(<<"feedfetcher-google", _/binary>>) -> true;