Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100755 108 lines (96 sloc) 3.904 kB
3f063cf twitter samples
U-REDMOND\wenmingy authored
-- Session settings needed by the dynamic-partition insert into twitter_temp below.
-- Turn dynamic partitioning on for this session.
set hive.exec.dynamic.partition=true;
-- nonstrict mode permits an insert in which every partition column is dynamic
-- (strict mode demands at least one static partition value).
set hive.exec.dynamic.partition.mode=nonstrict;

-- Rebuild twitter_temp from scratch: a flattened, typed view of each raw tweet
-- JSON document, partitioned by the file sequence number of its source row.
-- IF EXISTS keeps the script re-runnable even when the table has not been
-- created yet (for example on a first run).
drop table if exists twitter_temp;

create table twitter_temp
(
    -- Tweet identity and creation time. The created_at_* columns are string
    -- slices of created_at produced by the insert statement that loads this table.
    id                      bigint,
    created_at              string,
    created_at_date         string,
    created_at_year         string,
    created_at_month        string,
    created_at_day          string,
    created_at_time         string,

    -- Tweet-level attributes, stored as the raw JSON scalar text unless typed.
    in_reply_to_user_id_str string,
    text                    string,
    contributors            string,
    retweeted               string,
    truncated               string,
    coordinates             string,
    source                  string,
    retweet_count           int,
    url                     string,

    -- Entities: fixed-size arrays filled by the loader, plus the first element
    -- of each array duplicated as a scalar column for convenience.
    hashtags                array<string>,
    user_mentions           array<string>,
    first_hashtag           string,
    first_user_mention      string,

    -- Author (user object) attributes.
    screen_name             string,
    name                    string,
    followers_count         int,
    listed_count            int,
    friends_count           int,
    lang                    string,
    user_location           string,
    time_zone               string,
    profile_image_url       string,

    -- The untouched source document, kept for re-parsing.
    json_response           string
)
partitioned by (filesequence int);


-- Load twitter_temp from twitter_raw by extracting fields out of the JSON text
-- in json_response. Select expressions are matched to the target columns by
-- position, so their order must follow the twitter_temp column order exactly.
-- The trailing filesequence expression feeds the dynamic partition column.
from twitter_raw
insert overwrite table twitter_temp
partition (filesequence)
select
    -- id: parsed from the string form of the tweet id.
    cast(get_json_object(json_response, '$.id_str') as bigint),

    -- created_at, kept verbatim. The substr offsets below assume the fixed-width
    -- layout used by Twitter, e.g.  Wed Aug 27 13:08:45 +0000 2008
    -- (TODO confirm against the actual feed).
    get_json_object(json_response, '$.created_at'),

    -- created_at_date: weekday, month and day (chars 1-10) followed by the year.
    concat(substr(get_json_object(json_response, '$.created_at'), 1, 10), ' ',
           substr(get_json_object(json_response, '$.created_at'), 27, 4)),

    -- created_at_year: the 4 characters starting at position 27.
    substr(get_json_object(json_response, '$.created_at'), 27, 4),

    -- created_at_month: three-letter month abbreviation mapped to a two-digit
    -- number. Any other value yields NULL because there is no else branch.
    case substr(get_json_object(json_response, '$.created_at'), 5, 3)
        when 'Jan' then '01'
        when 'Feb' then '02'
        when 'Mar' then '03'
        when 'Apr' then '04'
        when 'May' then '05'
        when 'Jun' then '06'
        when 'Jul' then '07'
        when 'Aug' then '08'
        when 'Sep' then '09'
        when 'Oct' then '10'
        when 'Nov' then '11'
        when 'Dec' then '12'
    end,

    -- created_at_day: the 2 characters starting at position 9.
    substr(get_json_object(json_response, '$.created_at'), 9, 2),

    -- created_at_time: the 8 characters (hh:mm:ss) starting at position 12.
    substr(get_json_object(json_response, '$.created_at'), 12, 8),

    -- Tweet-level attributes.
    get_json_object(json_response, '$.in_reply_to_user_id_str'),
    get_json_object(json_response, '$.text'),
    get_json_object(json_response, '$.contributors'),
    get_json_object(json_response, '$.retweeted'),
    get_json_object(json_response, '$.truncated'),
    get_json_object(json_response, '$.coordinates'),
    get_json_object(json_response, '$.source'),
    cast(get_json_object(json_response, '$.retweet_count') as int),

    -- url: display form of the first URL entity. display_url is a field of the
    -- objects inside the entities.urls array, not of entities itself, so the
    -- path has to index into that array (same approach as first_hashtag below).
    get_json_object(json_response, '$.entities.urls[0].display_url'),

    -- hashtags: always a 5-element array holding the first five hashtag texts,
    -- lower-cased and trimmed. Slots with no matching hashtag are expected to
    -- be NULL.
    array(
        trim(lower(get_json_object(json_response, '$.entities.hashtags[0].text'))),
        trim(lower(get_json_object(json_response, '$.entities.hashtags[1].text'))),
        trim(lower(get_json_object(json_response, '$.entities.hashtags[2].text'))),
        trim(lower(get_json_object(json_response, '$.entities.hashtags[3].text'))),
        trim(lower(get_json_object(json_response, '$.entities.hashtags[4].text')))),

    -- user_mentions: same fixed 5-slot scheme, using the mentioned screen names.
    array(
        trim(lower(get_json_object(json_response, '$.entities.user_mentions[0].screen_name'))),
        trim(lower(get_json_object(json_response, '$.entities.user_mentions[1].screen_name'))),
        trim(lower(get_json_object(json_response, '$.entities.user_mentions[2].screen_name'))),
        trim(lower(get_json_object(json_response, '$.entities.user_mentions[3].screen_name'))),
        trim(lower(get_json_object(json_response, '$.entities.user_mentions[4].screen_name')))),

    -- first_hashtag and first_user_mention: element 0 of the two arrays above.
    trim(lower(get_json_object(json_response, '$.entities.hashtags[0].text'))),
    trim(lower(get_json_object(json_response, '$.entities.user_mentions[0].screen_name'))),

    -- Author (user object) attributes.
    get_json_object(json_response, '$.user.screen_name'),
    get_json_object(json_response, '$.user.name'),
    cast(get_json_object(json_response, '$.user.followers_count') as int),
    cast(get_json_object(json_response, '$.user.listed_count') as int),
    cast(get_json_object(json_response, '$.user.friends_count') as int),
    get_json_object(json_response, '$.user.lang'),
    get_json_object(json_response, '$.user.location'),
    get_json_object(json_response, '$.user.time_zone'),
    get_json_object(json_response, '$.user.profile_image_url'),

    -- The raw document itself, then the dynamic partition value (must be last).
    json_response,
    filesequence
-- Keep only rows whose raw JSON is longer than 500 characters. Presumably this
-- drops non-tweet or incomplete records. NOTE(review): confirm the threshold.
where (length(json_response) > 500);
Something went wrong with that request. Please try again.