Permalink
Browse files

twitter samples

  • Loading branch information...
1 parent 20c6da0 commit 3f063cff2cd7af99910c028cd5d9a38768584f5a U-REDMOND\wenmingy committed Jun 16, 2012
View
@@ -0,0 +1,11 @@
+Use curl to collect tweet data; this might take a while.
+
+Use the gettwitter script; edit it first to set your Twitter user name and password.
+
+Then upload the tweet file (edit the cluster name, user name, and password in the upload script first).
+Once the data is up on the Hadoop cluster, continue with the Hive scripts.
+
+Run load_twitter_raw to create the raw table,
+then run twitter_temp to do the ETL process.
+
+Finally, use the twitter query (you may have to modify it for your topic).
@@ -0,0 +1 @@
+curl --insecure --user user:{password} --data @twitter_parameters.txt https://stream.twitter.com/1/statuses/filter.json >>twitter_stream_seq2.txt
@@ -0,0 +1,11 @@
+-- Rebuilds the raw staging table that holds the uploaded Twitter stream
+-- file as unparsed JSON text in a single string column.
+
+-- IF EXISTS keeps the script re-runnable on a cluster where the table has
+-- not been created yet, regardless of hive.exec.drop.ignorenonexistent.
+drop table if exists twitter_raw;
+
+create table twitter_raw (
+ json_response string
+)
+partitioned by (filesequence int);
+
+
+-- LOAD DATA INPATH moves the file from its HDFS location into the partition.
+-- Adjust the path to the file you uploaded, and use a new filesequence
+-- value for each additional file you load.
+load data inpath '/example/data/twitter_stream_seq8.txt'
+into table twitter_raw
+partition (filesequence = 1);

Large diffs are not rendered by default.

Oops, something went wrong.
@@ -0,0 +1,2 @@
+-- Top 20 tweet rows by follower count where the author name contains Scott.
+select
+    name,
+    screen_name,
+    followers_count
+from twitter_temp
+where name like "%Scott%"
+order by followers_count desc
+limit 20;
+
+-- The 10 authors with the most tweets whose text contains Azure.
+select
+    name,
+    screen_name,
+    count(1) as cc
+from twitter_temp
+where text like "%Azure%"
+group by name, screen_name
+order by cc desc
+limit 10;
@@ -0,0 +1 @@
+track=weather,Azure,WindowsAzure,cloud
@@ -0,0 +1,107 @@
+-- ETL step: parses the raw JSON rows in twitter_raw into the typed,
+-- column-per-field table twitter_temp, keeping the filesequence partitioning.
+
+-- The insert below takes the partition value from the last select column,
+-- which requires dynamic partitioning in nonstrict mode.
+set hive.exec.dynamic.partition = true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+
+-- IF EXISTS keeps the script re-runnable when the table does not exist yet.
+drop table if exists twitter_temp;
+
+create table twitter_temp
+(
+    id bigint,
+    created_at string,
+    created_at_date string,
+    created_at_year string,
+    created_at_month string,
+    created_at_day string,
+    created_at_time string,
+    in_reply_to_user_id_str string,
+    text string,
+    contributors string,
+    retweeted string,
+    truncated string,
+    coordinates string,
+    source string,
+    retweet_count int,
+    url string,
+    hashtags array<string>,
+    user_mentions array<string>,
+    first_hashtag string,
+    first_user_mention string,
+    screen_name string,
+    name string,
+    followers_count int,
+    listed_count int,
+    friends_count int,
+    lang string,
+    user_location string,
+    time_zone string,
+    profile_image_url string,
+    json_response string
+)
+partitioned by (filesequence int);
+
+
+-- The select list is positional: expressions must stay in the same order as
+-- the columns of twitter_temp, with the partition column last.
+from twitter_raw
+insert overwrite table twitter_temp
+partition (filesequence)
+select
+    cast(get_json_object(json_response, '$.id_str') as bigint),
+
+-- created_at is sliced by fixed character offsets. This assumes the Twitter
+-- layout, for example: Wed Aug 27 13:08:45 +0000 2008
+    get_json_object(json_response, '$.created_at'),
+
+-- created_at_date: weekday, month and day (chars 1-10) plus the year (chars 27-30).
+    concat(substr(get_json_object(json_response, '$.created_at'), 1, 10), ' ',
+           substr(get_json_object(json_response, '$.created_at'), 27, 4)),
+
+-- created_at_year
+    substr(get_json_object(json_response, '$.created_at'), 27, 4),
+
+-- created_at_month: month abbreviation mapped to a two-digit number,
+-- NULL when the abbreviation is not recognised.
+    case substr(get_json_object(json_response, '$.created_at'), 5, 3)
+        when "Jan" then "01"
+        when "Feb" then "02"
+        when "Mar" then "03"
+        when "Apr" then "04"
+        when "May" then "05"
+        when "Jun" then "06"
+        when "Jul" then "07"
+        when "Aug" then "08"
+        when "Sep" then "09"
+        when "Oct" then "10"
+        when "Nov" then "11"
+        when "Dec" then "12"
+    end,
+
+-- created_at_day
+    substr(get_json_object(json_response, '$.created_at'), 9, 2),
+
+-- created_at_time (hh:mm:ss)
+    substr(get_json_object(json_response, '$.created_at'), 12, 8),
+
+    get_json_object(json_response, '$.in_reply_to_user_id_str'),
+    get_json_object(json_response, '$.text'),
+    get_json_object(json_response, '$.contributors'),
+    get_json_object(json_response, '$.retweeted'),
+    get_json_object(json_response, '$.truncated'),
+    get_json_object(json_response, '$.coordinates'),
+    get_json_object(json_response, '$.source'),
+    cast(get_json_object(json_response, '$.retweet_count') as int),
+
+-- url: display_url is a field of each element of entities.urls, not of
+-- entities itself, so read it from the first URL entry. The previous path
+-- $.entities.display_url never matched and left this column NULL.
+    get_json_object(json_response, '$.entities.urls[0].display_url'),
+
+-- hashtags: first five hashtags, lower-cased. The array always has five
+-- slots and missing hashtags are NULL.
+    array(
+        trim(lower(get_json_object(json_response, '$.entities.hashtags[0].text'))),
+        trim(lower(get_json_object(json_response, '$.entities.hashtags[1].text'))),
+        trim(lower(get_json_object(json_response, '$.entities.hashtags[2].text'))),
+        trim(lower(get_json_object(json_response, '$.entities.hashtags[3].text'))),
+        trim(lower(get_json_object(json_response, '$.entities.hashtags[4].text')))),
+
+-- user_mentions: first five mentioned screen names, same shape as hashtags.
+    array(
+        trim(lower(get_json_object(json_response, '$.entities.user_mentions[0].screen_name'))),
+        trim(lower(get_json_object(json_response, '$.entities.user_mentions[1].screen_name'))),
+        trim(lower(get_json_object(json_response, '$.entities.user_mentions[2].screen_name'))),
+        trim(lower(get_json_object(json_response, '$.entities.user_mentions[3].screen_name'))),
+        trim(lower(get_json_object(json_response, '$.entities.user_mentions[4].screen_name')))),
+
+-- first_hashtag and first_user_mention as plain strings for easy grouping.
+    trim(lower(get_json_object(json_response, '$.entities.hashtags[0].text'))),
+    trim(lower(get_json_object(json_response, '$.entities.user_mentions[0].screen_name'))),
+
+-- Author (user) attributes.
+    get_json_object(json_response, '$.user.screen_name'),
+    get_json_object(json_response, '$.user.name'),
+    cast(get_json_object(json_response, '$.user.followers_count') as int),
+    cast(get_json_object(json_response, '$.user.listed_count') as int),
+    cast(get_json_object(json_response, '$.user.friends_count') as int),
+    get_json_object(json_response, '$.user.lang'),
+    get_json_object(json_response, '$.user.location'),
+    get_json_object(json_response, '$.user.time_zone'),
+    get_json_object(json_response, '$.user.profile_image_url'),
+
+-- Keep the original JSON alongside the parsed columns.
+    json_response,
+
+-- Dynamic partition column, must be last.
+    filesequence
+-- Only rows longer than 500 characters are loaded.
+-- NOTE(review): presumably this drops non-tweet stream messages and
+-- truncated lines. Confirm the threshold against real data.
+where (length(json_response) > 500);
@@ -0,0 +1,9 @@
+# Uploads a local tweet file to the cluster over FTPS (port 2226) using curl.
+# Edit the server name, user name, password and file name before running.
+$serverName = "yourhadooponazurecluster.cloudapp.net"; $userName = "yourclusterusername";
+$password = "{password}";
+$fileToUpload = "t2.txt"; $destination = "/example/data/";
+
+# The FTPS login uses the lower-case hex MD5 digest of the password.
+$Md5Hasher = [System.Security.Cryptography.MD5]::Create();
+$hashBytes = $Md5Hasher.ComputeHash($([Char[]]$password))
+
+# Start from an empty string: without this, re-running the script in the same
+# session appends to the hash left over from the previous run.
+$passwordHash = ""
+foreach ($byte in $hashBytes) { $passwordHash += "{0:x2}" -f $byte }
+
+# Call curl directly instead of Invoke-Expression on a concatenated string, so
+# file names or credentials with spaces or special characters are passed as
+# single arguments rather than being re-parsed as PowerShell code.
+# -k skips certificate validation and -T uploads the file.
+& .\curl -k --ftp-create-dirs -T $fileToUpload -u "${userName}:$passwordHash" "ftps://${serverName}:2226$destination"

0 comments on commit 3f063cf

Please sign in to comment.